added Gab scraper

This commit is contained in:
Tristan Lee
2022-02-28 12:11:21 -06:00
parent 7a257ea9f5
commit bc840e631d
6 changed files with 126 additions and 5 deletions

View File

@@ -15,6 +15,7 @@ boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
[dev-packages]

58
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c"
"sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507"
},
"pipfile-spec": 6,
"requires": {
@@ -23,6 +23,14 @@
],
"version": "==0.7.12"
},
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
@@ -124,6 +132,13 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"garc": {
"hashes": [
"sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216"
],
"index": "pypi",
"version": "==2.1"
},
"gogettr": {
"hashes": [
"sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
@@ -217,6 +232,13 @@
"markers": "python_version < '3.10'",
"version": "==4.11.2"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
"version": "==1.1.1"
},
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
@@ -414,10 +436,26 @@
"markers": "python_version >= '3.8'",
"version": "==1.4.1"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"polyphemus": {
"git": "https://github.com/bellingcat/polyphemus.git",
"ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca"
},
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
@@ -442,6 +480,14 @@
],
"version": "==1.7.1"
},
"pytest": {
"hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
],
"markers": "python_version >= '3.6'",
"version": "==7.0.1"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
@@ -688,6 +734,14 @@
"index": "pypi",
"version": "==1.4.31"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"tzdata": {
"hashes": [
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
@@ -709,7 +763,7 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"zipp": {

View File

@@ -19,6 +19,9 @@ class Scraper:
'DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
pass
def __str__(self):
@@ -32,12 +35,13 @@ class Scraper:
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
n_retries = 0
r = requests.get(url)
r = requests.get(url, headers = self.headers)
while r.status_code != 200 and n_retries < 5:
logger.warning(f"{n_retries}/5: Request for {url} failed")
n_retries += 1
r = requests.get(url)
r = requests.get(url, headers = self.headers)
if r.status_code != 200:
logger.error(f"Could not fetch URL {url}")

View File

@@ -26,7 +26,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper):
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
session.headers.update(self.headers)
request = session.get("https://www.bitchute.com/search")
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")

53
cisticola/scraper/gab.py Normal file
View File

@@ -0,0 +1,53 @@
import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import Generator, Tuple
from garc import Garc
import tempfile
class GabScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gab, using GARC library"""

    __version__ = "GabScraper 0.0.1"

    # Prefix that identifies a Gab profile URL; the username follows it.
    _URL_PREFIX = 'https://gab.com/'

    @staticmethod
    def get_username_from_url(url):
        """Extract the Gab username from a profile URL.

        Parameters
        ----------
        url : str
            Channel URL, e.g. ``"https://gab.com/marc_capt"``.

        Returns
        -------
        str or None
            The first path segment after ``https://gab.com/`` (trailing
            slashes, extra path segments and query strings are dropped), or
            ``None`` when ``url`` is empty or is not a Gab profile URL.
        """
        if not url:
            return None

        _, prefix, remainder = url.partition(GabScraper._URL_PREFIX)
        if not prefix:
            # Not a Gab URL: report "no username" rather than handing the
            # whole URL back, so that callers can test the result for None.
            return None

        # Keep only the first path segment and drop any query/fragment.
        username = remainder.split('/', 1)[0].split('?', 1)[0].split('#', 1)[0]
        return username or None

    @staticmethod
    def _parse_created_at(post):
        """Return the post's ``created_at`` value as a naive ``datetime``.

        A trailing ``"Z"`` is rewritten to ``"+00:00"`` so that
        ``datetime.fromisoformat`` accepts it; the timezone information is
        then discarded (no conversion is applied).
        """
        timestamp = post['created_at'].replace("Z", "+00:00")
        return datetime.fromisoformat(timestamp).replace(tzinfo=None)

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each post of a Gab channel.

        Parameters
        ----------
        channel : cisticola.base.Channel
            Channel whose ``url`` points at a Gab profile.
        since : cisticola.base.ScraperResult, optional
            When given, iteration stops at the first post whose date is not
            later than ``since.date``.
            NOTE(review): this assumes garc yields posts newest-first -- confirm.

        Yields
        ------
        cisticola.base.ScraperResult
            One result per post, carrying the raw post as JSON and a mapping
            from each original media URL to the value returned by
            ``archive_media``.

        Raises
        ------
        ValueError
            If no Gab username can be extracted from ``channel.url``.
        """
        username = GabScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Not a Gab profile URL: {channel.url}")

        client = Garc(profile='main')

        for post in client.userposts(username):
            # Parse the timestamp once; it is needed for both the cut-off
            # check and the emitted result.
            post_date = GabScraper._parse_created_at(post)
            if since is not None and post_date <= since.date:
                break

            # Collect media from the post itself and, if present, from the
            # post it reposts. Missing/null attachment lists count as empty.
            media_urls = [m['url'] for m in (post.get('media_attachments') or [])]
            repost = post.get('repost')
            if repost is not None:
                media_urls.extend(
                    m['url'] for m in (repost.get('media_attachments') or []))

            archived_urls = {}
            for url in media_urls:
                media_blob, content_type, key = self.url_to_blob(url)
                archived_urls[url] = self.archive_media(
                    media_blob, content_type, key)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return ``True`` when ``channel`` is a Gab channel with a usable URL.

        Always returns a real ``bool`` (never ``None``).
        """
        return (channel.platform == "Gab"
                and GabScraper.get_username_from_url(channel.url) is not None)

View File

@@ -4,6 +4,7 @@ import cisticola.scraper.twitter
import cisticola.scraper.gettr
import cisticola.scraper.bitchute
import cisticola.scraper.odysee
import cisticola.scraper.gab
from sqlalchemy import create_engine
@@ -31,6 +32,11 @@ test_channels = [
id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon',
category="test", followers=None, platform="Odysee",
url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(
id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
category="test", followers=None, platform="Gab",
url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
influencer=None, public=True, chat=False, notes="")]
@@ -51,6 +57,9 @@ controller.register_scraper(bitchute)
odysee = cisticola.scraper.odysee.OdyseeScraper()
controller.register_scraper(odysee)
gab = cisticola.scraper.gab.GabScraper()
controller.register_scraper(gab)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)