mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added Gab scraper
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -15,6 +15,7 @@ boto3 = "*"
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
ffmpeg-python = "*"
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
|
||||
58
Pipfile.lock
generated
58
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c"
|
||||
"sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -23,6 +23,14 @@
|
||||
],
|
||||
"version": "==0.7.12"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.4.0"
|
||||
},
|
||||
"babel": {
|
||||
"hashes": [
|
||||
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
|
||||
@@ -124,6 +132,13 @@
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.18.2"
|
||||
},
|
||||
"garc": {
|
||||
"hashes": [
|
||||
"sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.1"
|
||||
},
|
||||
"gogettr": {
|
||||
"hashes": [
|
||||
"sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
|
||||
@@ -217,6 +232,13 @@
|
||||
"markers": "python_version < '3.10'",
|
||||
"version": "==4.11.2"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
|
||||
@@ -414,10 +436,26 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.4.1"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"polyphemus": {
|
||||
"git": "https://github.com/bellingcat/polyphemus.git",
|
||||
"ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
|
||||
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
||||
@@ -442,6 +480,14 @@
|
||||
],
|
||||
"version": "==1.7.1"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
|
||||
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==7.0.1"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
|
||||
@@ -688,6 +734,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==1.4.31"
|
||||
},
|
||||
"tomli": {
|
||||
"hashes": [
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"tzdata": {
|
||||
"hashes": [
|
||||
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
|
||||
@@ -709,7 +763,7 @@
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.8"
|
||||
},
|
||||
"zipp": {
|
||||
|
||||
@@ -19,6 +19,9 @@ class Scraper:
|
||||
'DO_SPACES_KEY'),
|
||||
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
|
||||
|
||||
self.headers = {
|
||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
@@ -32,12 +35,13 @@ class Scraper:
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
n_retries = 0
|
||||
r = requests.get(url)
|
||||
|
||||
r = requests.get(url, headers = self.headers)
|
||||
|
||||
while r.status_code != 200 and n_retries < 5:
|
||||
logger.warning(f"{n_retries}/5: Request for {url} failed")
|
||||
n_retries += 1
|
||||
r = requests.get(url)
|
||||
r = requests.get(url, headers = self.headers)
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(f"Could not fetch URL {url}")
|
||||
|
||||
@@ -26,7 +26,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
||||
session.headers.update(self.headers)
|
||||
request = session.get("https://www.bitchute.com/search")
|
||||
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
|
||||
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")
|
||||
|
||||
53
cisticola/scraper/gab.py
Normal file
53
cisticola/scraper/gab.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
from garc import Garc
|
||||
import tempfile
|
||||
|
||||
class GabScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gab, using the GARC library."""

    __version__ = "GabScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Extract the Gab account name from a profile URL.

        Args:
            url: Profile URL of the form ``https://gab.com/<username>``.

        Returns:
            The text following ``https://gab.com/`` with surrounding
            whitespace and slashes removed, or ``None`` when ``url`` is
            empty, does not contain that prefix, or has nothing after it.
        """
        prefix = 'https://gab.com/'

        if not url or prefix not in url:
            return None

        username = url.split(prefix)[-1].strip().strip('/')

        # An empty remainder (e.g. the bare site root) is not an account.
        return username or None

    @staticmethod
    def _parse_timestamp(timestamp):
        """Convert a ``created_at`` string into a naive ``datetime``.

        A trailing ``Z`` is rewritten to ``+00:00`` so that
        ``datetime.fromisoformat`` accepts it; the timezone is then dropped
        so the value can be compared with the naive ``since.date``.
        """
        return datetime.fromisoformat(
            timestamp.replace("Z", "+00:00")).replace(tzinfo=None)

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of the given Gab channel.

        Media attached to a post (and to the post it reposts, if any) is
        fetched with ``url_to_blob`` and stored with ``archive_media``; the
        mapping from original URL to archived URL is attached to the result.

        Args:
            channel: Channel whose ``url`` identifies the Gab account.
            since: Optional previous result. Iteration stops at the first
                post whose date is not later than ``since.date``.

        Raises:
            ValueError: If no username can be extracted from ``channel.url``.
        """
        username = GabScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(
                f"Could not extract a Gab username from URL {channel.url!r}")

        client = Garc(profile='main')

        for post in client.userposts(username):
            post_date = GabScraper._parse_timestamp(post['created_at'])

            # NOTE(review): stopping at the first old post presumes posts
            # arrive newest-first -- confirm against garc's userposts().
            if since is not None and post_date <= since.date:
                break

            # `or []` tolerates both a missing key and an explicit null.
            media_urls = [
                media['url'] for media in post.get('media_attachments') or []]

            repost = post.get('repost')
            if repost is not None:
                media_urls.extend(
                    media['url']
                    for media in repost.get('media_attachments') or [])

            archived_urls = {}
            for url in media_urls:
                # The same attachment can appear on the post and its repost;
                # archive it only once.
                if url in archived_urls:
                    continue

                media_blob, content_type, key = self.url_to_blob(url)
                archived_urls[url] = self.archive_media(
                    media_blob, content_type, key)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return ``True`` when ``channel`` is a Gab channel with a usable URL.

        Always returns a ``bool``: ``False`` when the platform is not
        ``"Gab"`` or no username can be extracted from ``channel.url``.
        """
        return (channel.platform == "Gab"
                and GabScraper.get_username_from_url(channel.url) is not None)
|
||||
9
test.py
9
test.py
@@ -4,6 +4,7 @@ import cisticola.scraper.twitter
|
||||
import cisticola.scraper.gettr
|
||||
import cisticola.scraper.bitchute
|
||||
import cisticola.scraper.odysee
|
||||
import cisticola.scraper.gab
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
@@ -31,6 +32,11 @@ test_channels = [
|
||||
id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon',
|
||||
category="test", followers=None, platform="Odysee",
|
||||
url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
|
||||
category="test", followers=None, platform="Gab",
|
||||
url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
|
||||
influencer=None, public=True, chat=False, notes="")]
|
||||
|
||||
|
||||
@@ -51,6 +57,9 @@ controller.register_scraper(bitchute)
|
||||
odysee = cisticola.scraper.odysee.OdyseeScraper()
|
||||
controller.register_scraper(odysee)
|
||||
|
||||
gab = cisticola.scraper.gab.GabScraper()
|
||||
controller.register_scraper(gab)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user