diff --git a/.gitignore b/.gitignore index 643d95d..b107dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ *.db docs/build/ docs/source/_* - +.env diff --git a/Pipfile b/Pipfile index cd6422b..533ee24 100644 --- a/Pipfile +++ b/Pipfile @@ -5,13 +5,14 @@ name = "pypi" [packages] sqlalchemy = "*" -snscrape = "*" loguru = "*" gogettr = "*" requests = "*" bs4 = "*" dateparser = "*" sphinx = "*" +boto3 = "*" +snscrape = {git = "https://github.com/bellingcat/snscrape.git"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 4b40a97..8a11934 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7" + "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561" }, "pipfile-spec": 6, "requires": { @@ -39,6 +39,22 @@ "markers": "python_version >= '3.1'", "version": "==4.10.0" }, + "boto3": { + "hashes": [ + "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d", + "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887" + ], + "index": "pypi", + "version": "==1.21.6" + }, + "botocore": { + "hashes": [ + "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c", + "sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949" + ], + "markers": "python_version >= '3.6'", + "version": "==1.24.6" + }, "bs4": { "hashes": [ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" @@ -194,6 +210,14 @@ "markers": "python_version >= '3.6'", "version": "==3.0.3" }, + "jmespath": { + "hashes": [ + "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", + "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.0" + }, "loguru": { "hashes": [ "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", @@ -460,6 +484,14 @@ "index": "pypi", "version": "==2.27.1" }, + "s3transfer": { + "hashes": [ + "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", + "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + ], + "markers": "python_version >= '3.6'", + "version": "==0.5.1" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", @@ -476,12 +508,8 @@ "version": "==2.2.0" }, "snscrape": { - "hashes": [ - "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", - "sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2" - ], - "index": "pypi", - "version": "==0.4.3.20220106" + "git": "https://github.com/bellingcat/snscrape.git", + "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" }, "soupsieve": { "hashes": [ diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 115d143..2b873cc 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -1,6 +1,6 @@ from typing import List -import cisticola.scraper import cisticola.base +import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger @@ -14,7 +14,7 @@ class ScraperController: self.session = None self.mapper_registry = None - def register_scraper(self, scraper: cisticola.scraper.Scraper): + def register_scraper(self, scraper: cisticola.scraper.base.Scraper): self.scrapers.append(scraper) def scrape_channels(self, channels: List[cisticola.base.Channel]): @@ -27,10 +27,15 @@ class ScraperController: for scraper in self.scrapers: if scraper.can_handle(channel): + session = self.session() + handled = True + added = 0 + # get most recent post session = self.session() - rows = session.query(cisticola.base.ScraperResult).order_by( - cisticola.base.ScraperResult.date_archived).limit(1).all() + rows = session.query(cisticola.base.ScraperResult).where( + cisticola.base.ScraperResult.channel == channel.id).order_by( + cisticola.base.ScraperResult.date.desc()).limit(1).all() if len(rows) == 1: since = rows[0] @@ -38,21 +43,19 @@ class ScraperController: since = None posts = scraper.get_posts(channel, since=since) - handled = True + for post in posts: + session.add(post) + added += 1 + + session.commit() logger.info( - f"{scraper} found {len(posts)} new posts from {channel}") + f"{scraper} found {added} new posts from {channel}") break if not handled: logger.warning(f"No handler found for Channel {channel}") - session = self.session() - session.bulk_save_objects(posts) - session.commit() - - logger.info(f"Added {len(posts)} entries to database") - def connect_to_db(self, engine): # create tables cisticola.base.mapper_registry.metadata.create_all(bind=engine) diff --git a/cisticola/base.py b/cisticola/base.py index 57880d0..03a1641 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from datetime import datetime from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey mapper_registry = registry() @@ -12,11 +12,12 @@ class ScraperResult: scraper: str platform: str - channel: int + channel: int #TODO there is probably a way of making this a Channel object foreign key platform_id: str date: datetime raw_data: str date_archived: datetime + archived_urls: dict raw_data_table = Table('raw_data', mapper_registry.metadata, @@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('platform_id', String), Column('date', DateTime), Column('raw_data', String), - Column('date_archived', DateTime)) + Column('date_archived', DateTime), + Column('archived_urls', JSON)) mapper_registry.map_imperatively(ScraperResult, raw_data_table) @@ -42,6 +44,7 @@ class Channel: followers: int platform: str url: str + screenname: str country: str influencer: str public: bool diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index c8a3166..e69de29 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,18 +0,0 @@ -from typing import List -import cisticola.base - - -class Scraper: - __version__ = "Scraper 0.0.0" - - def __init__(self): - pass - - def __str__(self): - return self.__version__ - - def can_handle(self, channel: cisticola.base.Channel) -> bool: - pass - - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - pass diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py new file mode 100644 index 0000000..a41a0b2 --- /dev/null +++ b/cisticola/scraper/base.py @@ -0,0 +1,61 @@ +from typing import Generator +import cisticola.base +import requests +import os +import boto3 +from io import BytesIO +from loguru import logger + +class Scraper: + __version__ = "Scraper 0.0.0" + + def __init__(self): + self.s3_client = boto3.client('s3', + region_name=os.getenv( + 'DO_SPACES_REGION'), + endpoint_url='https://{}.digitaloceanspaces.com'.format( + os.getenv('DO_SPACES_REGION')), + aws_access_key_id=os.getenv( + 'DO_SPACES_KEY'), + aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) + + pass + + def __str__(self): + return self.__version__ + + def archive_media(self, url: str, key: str = None) -> str: + n_retries = 0 + r = requests.get(url) + + while r.status_code != 200 and n_retries < 5: + logger.warning(f"{n_retries}/5: Request for {url} failed") + n_retries += 1 + r = requests.get(url) + + if r.status_code != 200: + logger.error(f"Could not fetch URL {url}") + return url + + blob = r.content + + content_type = r.headers.get('Content-Type') + + if key is None: + key = url.split('/')[-1] + key = key.split('?')[0] + + filename = self.__version__.replace(' ', '_') + '/' + key + + self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv( + 'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type}) + + archived_url = os.getenv('DO_URL') + '/' + filename + + return archived_url + + def can_handle(self, channel: cisticola.base.Channel) -> bool: + pass + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + pass diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index ed144dc..c5c9f66 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -4,7 +4,7 @@ import re from html.parser import HTMLParser import dateparser import json -from typing import List +from typing import Generator import requests from bs4 import BeautifulSoup @@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: session = requests.Session() session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index c656549..9a52a69 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -1,10 +1,11 @@ import cisticola.base +import cisticola.scraper.base from datetime import datetime import json -from typing import List +from typing import Generator from gogettr import PublicClient -class GettrScraper(cisticola.scraper.Scraper): +class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -15,26 +16,41 @@ class GettrScraper(cisticola.scraper.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: - if since is not None and post['cdate'] <= int(since.date_archived.timestamp()): + if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + archived_url = self.archive_media(url) + archived_urls[img] = archived_url + + if 'main' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['main']) + archived_urls[post['main']] = archived_url + + # TODO this is just archiving the playlist file, not the actual video + if 'vid' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['vid']) + archived_urls[post['vid']] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Gettr", - channel=username, + channel=channel.id, platform_id=post['_id'], date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(), - raw_data=json.dumps(post))) - - return posts + raw_data=json.dumps(post), + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py new file mode 100644 index 0000000..bba9276 --- /dev/null +++ b/cisticola/scraper/telegram_snscrape.py @@ -0,0 +1,44 @@ +import cisticola.base +import cisticola.scraper.base +from typing import Generator +import snscrape.modules +from datetime import datetime, timezone + + +class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): + __version__ = "TelegramSnscrapeScraper 0.0.1" + + def can_handle(self, channel): + if channel.platform == "Telegram" and channel.public and not channel.chat: + return True + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + scr = snscrape.modules.telegram.TelegramChannelScraper( + channel.screenname) + + g = scr.get_items() + + for post in g: + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): + break + + archived_urls = {} + + for image_url in post.images: + archive_url = self.archive_media(image_url) + archived_urls[image_url] = archive_url + + if post.video: + video_archive_url = self.archive_media(post.video) + archived_urls[post.video] = video_archive_url + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Telegram", + channel=channel.id, + platform_id=post.url, + date=post.date, + date_archived=datetime.now(timezone.utc), + raw_data=post.json(), + archived_urls=archived_urls + ) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 19766a4..a43365c 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,42 +1,59 @@ import cisticola.base -from datetime import datetime -from typing import List +import cisticola.scraper.base +from datetime import datetime, timezone +from typing import Generator import snscrape.modules +from loguru import logger -class TwitterScraper(cisticola.scraper.Scraper): +class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - # TODO snscrape should be able to scrape from user ID alone, but there is - # currently a bug/other issue, so it is extracting the username from URL - def get_username_from_url(url): - username = url.split("twitter.com/")[1] - if len(username.split("/")) > 1: - return None + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id) - return username - - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] - scraper = snscrape.modules.twitter.TwitterUserScraper( - TwitterScraper.get_username_from_url(channel.url)) + first = True for tweet in scraper.get_items(): - if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp(): - break + if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + # with TwitterProfileScraper, the first tweet could be an old pinned tweet + if first: + first = False + continue + else: + break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if tweet.media: + for media in tweet.media: + if type(media) == snscrape.modules.twitter.Video: + variant = max( + [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) + url = variant.url + elif type(media) == snscrape.modules.twitter.Gif: + url = media.variants[0].url + elif type(media) == snscrape.modules.twitter.Photo: + url = media.fullUrl + else: + logger.warning(f"Could not get media URL of {media}") + url = None + + if url is not None: + archived_url = self.archive_media(url) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Twitter", channel=channel.id, platform_id=tweet.id, date=tweet.date, date_archived=datetime.now(), - raw_data=tweet.json())) - - return posts + raw_data=tweet.json(), + archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Twitter" and channel.platform_id: return True diff --git a/test.py b/test.py index b8c093d..9c60fb0 100644 --- a/test.py +++ b/test.py @@ -1,38 +1,43 @@ -# TODO/TODECIDE: -# should 'username' be a part of the Channel definition somehow? -# still need to do some planning for handling media - import cisticola +import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter +import cisticola.scraper.gettr from sqlalchemy import create_engine -test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, +test_channels = [ + cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", country="US", + url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, category="qanon", followers=None, platform="Telegram", - url="https://t.me/jqhnspartan", country="FR", + url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR", influencer="JQNH SPARTAN", public=True, chat=False, notes=""), cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', category="qanon", followers=None, platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", country="US", + url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC', category="nazi", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US", + url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US", influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController() -scraper = cisticola.scraper.twitter.TwitterScraper() -controller.register_scraper(scraper) +twitter = cisticola.scraper.twitter.TwitterScraper() +controller.register_scraper(twitter) -engine = create_engine('sqlite:///test.db') +telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() +controller.register_scraper(telegram) + +gettr = cisticola.scraper.gettr.GettrScraper() +controller.register_scraper(gettr) + +engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels)