From 6092e4caa5aaef84818c49862840412fce22b520 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 16:36:55 +0100 Subject: [PATCH 01/12] Add method for archiving media, reoranize scraper base classes --- .env | 5 +++++ Pipfile | 3 ++- Pipfile.lock | 42 +++++++++++++++++++++++++++++------ cisticola/__init__.py | 6 ++--- cisticola/base.py | 1 + cisticola/scraper/__init__.py | 18 --------------- cisticola/scraper/twitter.py | 3 ++- test.py | 17 ++++++++------ 8 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 .env diff --git a/.env b/.env new file mode 100644 index 0000000..ace9ca9 --- /dev/null +++ b/.env @@ -0,0 +1,5 @@ +DO_SPACES_REGION=ams3 +DO_SPACES_KEY=DKIMQ7ABHPOBC4OZDEQR +DO_SPACES_SECRET=uqKaPQsV4WmskQr8/O2NTS+OHiTNV2yVJn8u9Ny0rsA +DO_BUCKET=cisticola-test +DO_URL=https://cisticola-test.ams3.digitaloceanspaces.com \ No newline at end of file diff --git a/Pipfile b/Pipfile index cd6422b..533ee24 100644 --- a/Pipfile +++ b/Pipfile @@ -5,13 +5,14 @@ name = "pypi" [packages] sqlalchemy = "*" -snscrape = "*" loguru = "*" gogettr = "*" requests = "*" bs4 = "*" dateparser = "*" sphinx = "*" +boto3 = "*" +snscrape = {git = "https://github.com/bellingcat/snscrape.git"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 4b40a97..919e8f0 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7" + "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561" }, "pipfile-spec": 6, "requires": { @@ -39,6 +39,22 @@ "markers": "python_version >= '3.1'", "version": "==4.10.0" }, + "boto3": { + "hashes": [ + "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d", + "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887" + ], + "index": "pypi", + "version": "==1.21.6" + }, + "botocore": { + "hashes": [ + "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c", + "sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949" + ], + "markers": "python_version >= '3.6'", + "version": "==1.24.6" + }, "bs4": { "hashes": [ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" @@ -194,6 +210,14 @@ "markers": "python_version >= '3.6'", "version": "==3.0.3" }, + "jmespath": { + "hashes": [ + "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", + "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.0" + }, "loguru": { "hashes": [ "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", @@ -460,6 +484,14 @@ "index": "pypi", "version": "==2.27.1" }, + "s3transfer": { + "hashes": [ + "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", + "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + ], + "markers": "python_version >= '3.6'", + "version": "==0.5.1" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", @@ -476,12 +508,8 @@ "version": "==2.2.0" }, "snscrape": { - "hashes": [ - "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", - "sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2" - ], - "index": "pypi", - "version": "==0.4.3.20220106" + "git": "https://github.com/bellingcat/snscrape.git", + "ref": "72b26f2373f3fecf53bdf9c62d7408df3d15a329" }, "soupsieve": { "hashes": [ diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 115d143..ee24a4e 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -1,6 +1,6 @@ from typing import List -import cisticola.scraper import cisticola.base +import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger @@ -14,7 +14,7 @@ class ScraperController: self.session = None self.mapper_registry = None - def register_scraper(self, scraper: cisticola.scraper.Scraper): + def register_scraper(self, scraper: cisticola.scraper.base.Scraper): self.scrapers.append(scraper) def scrape_channels(self, channels: List[cisticola.base.Channel]): @@ -30,7 +30,7 @@ class ScraperController: # get most recent post session = self.session() rows = session.query(cisticola.base.ScraperResult).order_by( - cisticola.base.ScraperResult.date_archived).limit(1).all() + cisticola.base.ScraperResult.date.desc()).limit(1).all() if len(rows) == 1: since = rows[0] diff --git a/cisticola/base.py b/cisticola/base.py index 57880d0..da811df 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -42,6 +42,7 @@ class Channel: followers: int platform: str url: str + screenname: str country: str influencer: str public: bool diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index c8a3166..e69de29 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,18 +0,0 @@ -from typing import List -import cisticola.base - - -class Scraper: - __version__ = "Scraper 0.0.0" - - def __init__(self): - pass - - def __str__(self): - return self.__version__ - - def can_handle(self, channel: cisticola.base.Channel) -> bool: - pass - - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - pass diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 19766a4..e3c399d 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,10 +1,11 @@ import cisticola.base +import cisticola.scraper.base from datetime import datetime from typing import List import snscrape.modules -class TwitterScraper(cisticola.scraper.Scraper): +class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" diff --git a/test.py b/test.py index b8c093d..1885487 100644 --- a/test.py +++ b/test.py @@ -3,36 +3,39 @@ # still need to do some planning for handling media import cisticola -import cisticola.scraper.twitter +import cisticola.scraper.telegram_snscrape from sqlalchemy import create_engine test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", country="US", + url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, category="qanon", followers=None, platform="Telegram", - url="https://t.me/jqhnspartan", country="FR", + url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR", influencer="JQNH SPARTAN", public=True, chat=False, notes=""), cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', category="qanon", followers=None, platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", country="US", + url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC', category="nazi", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US", + url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US", influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController() -scraper = cisticola.scraper.twitter.TwitterScraper() +# scraper = cisticola.scraper.twitter.TwitterScraper() +# controller.register_scraper(scraper) + +scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(scraper) -engine = create_engine('sqlite:///test.db') +engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels) From a87cfd570a01590115b4cf6fe0e1a74a040b2f18 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 16:37:13 +0100 Subject: [PATCH 02/12] Add Telegram channel scraper --- cisticola/scraper/base.py | 58 ++++++++++++++++++++++++++ cisticola/scraper/telegram_snscrape.py | 47 +++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 cisticola/scraper/base.py create mode 100644 cisticola/scraper/telegram_snscrape.py diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py new file mode 100644 index 0000000..477ce7a --- /dev/null +++ b/cisticola/scraper/base.py @@ -0,0 +1,58 @@ +from typing import List +import cisticola.base +import requests +import os +import boto3 +from io import BytesIO +from loguru import logger + +class Scraper: + __version__ = "Scraper 0.0.0" + + def __init__(self): + self.s3_client = boto3.client('s3', + region_name=os.getenv( + 'DO_SPACES_REGION'), + endpoint_url='https://{}.digitaloceanspaces.com'.format( + os.getenv('DO_SPACES_REGION')), + aws_access_key_id=os.getenv( + 'DO_SPACES_KEY'), + aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) + + pass + + def __str__(self): + return self.__version__ + + def archive_media(self, url: str) -> str: + n_retries = 0 + r = requests.get(url) + + while r.status_code != 200 and n_retries < 5: + logger.warning(f"{n_retries}/5: Request for {url} failed") + n_retries += 1 + r = requests.get(url) + + if r.status_code != 200: + logger.error(f"Could not fetch URL {url}") + return url + + blob = r.content + + key = url.split('/')[-1] + key = key.split('?')[0] + + filename = self.__version__.replace(' ', '_') + '/' + key + + self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv( + 'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': 'image/jpeg'}) + + archived_url = os.getenv('DO_URL') + '/' + filename + + return archived_url + + def can_handle(self, channel: cisticola.base.Channel) -> bool: + pass + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + pass diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py new file mode 100644 index 0000000..5752d7d --- /dev/null +++ b/cisticola/scraper/telegram_snscrape.py @@ -0,0 +1,47 @@ + +import cisticola.base +import cisticola.scraper.base +from typing import List +import snscrape.modules +from datetime import datetime, timezone + + +class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): + __version__ = "TelegramSnscrapeScraper 0.0.1" + + def can_handle(self, channel): + if channel.platform == "Telegram" and channel.public and not channel.chat: + return True + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None): + posts = [] + scr = snscrape.modules.telegram.TelegramChannelScraper( + channel.screenname) + + g = scr.get_items() + + for post in g: + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): + break + + raw_data = post.json() + + for image_url in post.images: + archive_url = self.archive_media(image_url) + raw_data = raw_data.replace(image_url, archive_url) + + if post.video: + video_archive_url = self.archive_media(post.video) + raw_data = raw_data.replace(post.video, video_archive_url) + + posts.append(cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Telegram", + channel=channel.id, + platform_id=post.url, + date=post.date, + date_archived=datetime.now(), + raw_data=raw_data + )) + + return posts From 214287b7a8b1c8417494eadf6b2d931f30ca9c34 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 17:35:24 +0100 Subject: [PATCH 03/12] Archive media in dictionary --- cisticola/base.py | 6 ++++-- cisticola/scraper/base.py | 7 ++++--- cisticola/scraper/telegram_snscrape.py | 12 ++++++------ test.py | 6 +----- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index da811df..d2913e2 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from datetime import datetime from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey mapper_registry = registry() @@ -17,6 +17,7 @@ class ScraperResult: date: datetime raw_data: str date_archived: datetime + archived_urls: dict raw_data_table = Table('raw_data', mapper_registry.metadata, @@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('platform_id', String), Column('date', DateTime), Column('raw_data', String), - Column('date_archived', DateTime)) + Column('date_archived', DateTime), + Column('archived_urls', JSON)) mapper_registry.map_imperatively(ScraperResult, raw_data_table) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 477ce7a..3a399ec 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -24,7 +24,7 @@ class Scraper: def __str__(self): return self.__version__ - def archive_media(self, url: str) -> str: + def archive_media(self, url: str, key: str = None) -> str: n_retries = 0 r = requests.get(url) @@ -39,8 +39,9 @@ class Scraper: blob = r.content - key = url.split('/')[-1] - key = key.split('?')[0] + if key is None: + key = url.split('/')[-1] + key = key.split('?')[0] filename = self.__version__.replace(' ', '_') + '/' + key diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 5752d7d..6ea7fd7 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -1,4 +1,3 @@ - import cisticola.base import cisticola.scraper.base from typing import List @@ -24,15 +23,15 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - raw_data = post.json() + archived_urls = {} for image_url in post.images: archive_url = self.archive_media(image_url) - raw_data = raw_data.replace(image_url, archive_url) + archived_urls[image_url] = archive_url if post.video: video_archive_url = self.archive_media(post.video) - raw_data = raw_data.replace(post.video, video_archive_url) + archived_urls[post.video] = video_archive_url posts.append(cisticola.base.ScraperResult( scraper=self.__version__, @@ -40,8 +39,9 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): channel=channel.id, platform_id=post.url, date=post.date, - date_archived=datetime.now(), - raw_data=raw_data + date_archived=datetime.now(timezone.utc), + raw_data=post.json(), + archived_urls=archived_urls )) return posts diff --git a/test.py b/test.py index 1885487..c3e4d72 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,3 @@ -# TODO/TODECIDE: -# should 'username' be a part of the Channel definition somehow? -# still need to do some planning for handling media - import cisticola import cisticola.scraper.telegram_snscrape @@ -35,7 +31,7 @@ controller = cisticola.ScraperController() scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(scraper) -engine = create_engine('sqlite:///test3.db') +engine = create_engine('sqlite:///test4.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels) From e64d84500247ff91c0e8e0f5fefe0fb88e97dbd3 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 18:48:48 +0100 Subject: [PATCH 04/12] Archive media in Twitter scraper --- Pipfile.lock | 2 +- cisticola/scraper/twitter.py | 40 ++++++++++++++++++++++++++++++++---- test.py | 11 +++++----- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 919e8f0..8a11934 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -509,7 +509,7 @@ }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", - "ref": "72b26f2373f3fecf53bdf9c62d7408df3d15a329" + "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" }, "soupsieve": { "hashes": [ diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index e3c399d..bb85f48 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,8 +1,9 @@ import cisticola.base import cisticola.scraper.base -from datetime import datetime +from datetime import datetime, timezone from typing import List import snscrape.modules +from loguru import logger class TwitterScraper(cisticola.scraper.base.Scraper): @@ -20,13 +21,43 @@ class TwitterScraper(cisticola.scraper.base.Scraper): def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: posts = [] - scraper = snscrape.modules.twitter.TwitterUserScraper( + scraper = snscrape.modules.twitter.TwitterProfileScraper( TwitterScraper.get_username_from_url(channel.url)) + first = True + for tweet in scraper.get_items(): - if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp(): + if len(posts) >= 10: break + if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + # with TwitterProfileScraper, the first tweet could be an old pinned tweet + if first: + first = False + continue + else: + break + + archived_urls = {} + + if tweet.media: + for media in tweet.media: + if type(media) == snscrape.modules.twitter.Video: + variant = max( + [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) + url = variant.url + elif type(media) == snscrape.modules.twitter.Gif: + url = media.variants[0].url + elif type(media) == snscrape.modules.twitter.Photo: + url = media.fullUrl + else: + logger.warning(f"Could not get media URL of {media}") + url = None + + if url is not None: + archived_url = self.archive_media(url) + archived_urls[url] = archived_url + posts.append(cisticola.base.ScraperResult( scraper=self.__version__, platform="Twitter", @@ -34,7 +65,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper): platform_id=tweet.id, date=tweet.date, date_archived=datetime.now(), - raw_data=tweet.json())) + raw_data=tweet.json(), + archived_urls=archived_urls)) return posts diff --git a/test.py b/test.py index c3e4d72..0e4a6e0 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,6 @@ import cisticola import cisticola.scraper.telegram_snscrape +import cisticola.scraper.twitter from sqlalchemy import create_engine @@ -25,13 +26,13 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat controller = cisticola.ScraperController() -# scraper = cisticola.scraper.twitter.TwitterScraper() -# controller.register_scraper(scraper) +twitter = cisticola.scraper.twitter.TwitterScraper() +controller.register_scraper(twitter) -scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() -controller.register_scraper(scraper) +telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() +controller.register_scraper(telegram) -engine = create_engine('sqlite:///test4.db') +engine = create_engine('sqlite:///test.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels) From d163e6b3d906a3907961c1dec6967b117fe7b672 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 18:49:06 +0100 Subject: [PATCH 05/12] Fix logging logic in scraper controller --- cisticola/__init__.py | 10 +++++----- cisticola/scraper/telegram_snscrape.py | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cisticola/__init__.py b/cisticola/__init__.py index ee24a4e..e87aa19 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -46,12 +46,12 @@ class ScraperController: if not handled: logger.warning(f"No handler found for Channel {channel}") + else: + session = self.session() + session.bulk_save_objects(posts) + session.commit() - session = self.session() - session.bulk_save_objects(posts) - session.commit() - - logger.info(f"Added {len(posts)} entries to database") + logger.info(f"Added {len(posts)} entries to database") def connect_to_db(self, engine): # create tables diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 6ea7fd7..8bd8f15 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -20,6 +20,9 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): g = scr.get_items() for post in g: + if (len(posts)) >= 10: + break + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break From d159c09aa49b91d00b48b3a24e98111481058801 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 18:58:08 +0100 Subject: [PATCH 06/12] yield data rather than returning a list --- cisticola/__init__.py | 21 +++++++++++++-------- cisticola/scraper/telegram_snscrape.py | 10 ++-------- cisticola/scraper/twitter.py | 11 +++-------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/cisticola/__init__.py b/cisticola/__init__.py index e87aa19..5bac90b 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -4,6 +4,7 @@ import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger +MAX_POSTS = 10 class ScraperController: """Registers scrapers, uses them to generate ScraperResults. Synchronizes @@ -27,6 +28,10 @@ class ScraperController: for scraper in self.scrapers: if scraper.can_handle(channel): + session = self.session() + handled = True + added = 0 + # get most recent post session = self.session() rows = session.query(cisticola.base.ScraperResult).order_by( @@ -38,20 +43,20 @@ class ScraperController: since = None posts = scraper.get_posts(channel, since=since) - handled = True + for post in posts: + session.add(post) + added += 1 + if added >= MAX_POSTS: + break + + session.commit() logger.info( - f"{scraper} found {len(posts)} new posts from {channel}") + f"{scraper} found {added} new posts from {channel}") break if not handled: logger.warning(f"No handler found for Channel {channel}") - else: - session = self.session() - session.bulk_save_objects(posts) - session.commit() - - logger.info(f"Added {len(posts)} entries to database") def connect_to_db(self, engine): # create tables diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 8bd8f15..c76910f 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -13,16 +13,12 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): return True def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None): - posts = [] scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) g = scr.get_items() for post in g: - if (len(posts)) >= 10: - break - if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break @@ -36,7 +32,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): video_archive_url = self.archive_media(post.video) archived_urls[post.video] = video_archive_url - posts.append(cisticola.base.ScraperResult( + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Telegram", channel=channel.id, @@ -45,6 +41,4 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): date_archived=datetime.now(timezone.utc), raw_data=post.json(), archived_urls=archived_urls - )) - - return posts + ) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index bb85f48..41287ab 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -20,22 +20,19 @@ class TwitterScraper(cisticola.scraper.base.Scraper): return username def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] scraper = snscrape.modules.twitter.TwitterProfileScraper( TwitterScraper.get_username_from_url(channel.url)) first = True for tweet in scraper.get_items(): - if len(posts) >= 10: - break - if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): # with TwitterProfileScraper, the first tweet could be an old pinned tweet if first: first = False continue else: + print('too far') break archived_urls = {} @@ -58,7 +55,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(url) archived_urls[url] = archived_url - posts.append(cisticola.base.ScraperResult( + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Twitter", channel=channel.id, @@ -66,9 +63,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): date=tweet.date, date_archived=datetime.now(), raw_data=tweet.json(), - archived_urls=archived_urls)) - - return posts + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: From 456d592792e3dbb2bda8a3b418bcb94985fd01b7 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:24:03 +0100 Subject: [PATCH 07/12] Use user id for TwitterScraper --- cisticola/scraper/twitter.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 41287ab..4793b49 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -10,18 +10,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - # TODO snscrape should be able to scrape from user ID alone, but there is - # currently a bug/other issue, so it is extracting the username from URL - def get_username_from_url(url): - username = url.split("twitter.com/")[1] - if len(username.split("/")) > 1: - return None - - return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - scraper = snscrape.modules.twitter.TwitterProfileScraper( - TwitterScraper.get_username_from_url(channel.url)) + scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id) first = True @@ -66,5 +56,5 @@ class TwitterScraper(cisticola.scraper.base.Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Twitter" and channel.platform_id: return True From 0b1c175dd9579f67c597bcd3d63db3b7220251ac Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:25:14 +0100 Subject: [PATCH 08/12] Modify GettrScraper to yield results, archive media (videos incomplete) --- cisticola/base.py | 2 +- cisticola/scraper/gettr.py | 32 ++++++++++++++++++++++++-------- test.py | 9 +++++++-- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index d2913e2..03a1641 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -12,7 +12,7 @@ class ScraperResult: scraper: str platform: str - channel: int + channel: int #TODO there is probably a way of making this a Channel object foreign key platform_id: str date: datetime raw_data: str diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index c656549..5ae7d96 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -1,10 +1,11 @@ import cisticola.base +import cisticola.scraper.base from datetime import datetime import json from typing import List from gogettr import PublicClient -class GettrScraper(cisticola.scraper.Scraper): +class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -16,25 +17,40 @@ class GettrScraper(cisticola.scraper.Scraper): return username def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: - if since is not None and post['cdate'] <= int(since.date_archived.timestamp()): + if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + archived_url = self.archive_media(url) + archived_urls[img] = archived_url + + if 'main' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['main']) + archived_urls[post['main']] = archived_url + + # TODO this is just archiving the playlist file, not the actual video + if 'vid' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['vid']) + archived_urls[post['vid']] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Gettr", - channel=username, + channel=channel.id, platform_id=post['_id'], date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(), - raw_data=json.dumps(post))) - - return posts + raw_data=json.dumps(post), + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: diff --git a/test.py b/test.py index 0e4a6e0..9c60fb0 100644 --- a/test.py +++ b/test.py @@ -1,11 +1,13 @@ import cisticola import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter +import cisticola.scraper.gettr from sqlalchemy import create_engine -test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, +test_channels = [ + cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, category="test", followers=None, platform="Twitter", url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, @@ -32,7 +34,10 @@ controller.register_scraper(twitter) telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(telegram) -engine = create_engine('sqlite:///test.db') +gettr = cisticola.scraper.gettr.GettrScraper() +controller.register_scraper(gettr) + +engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels) From 1ad7c8bc11b37faa49277b590d8ed05d4c9647ad Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:26:10 +0100 Subject: [PATCH 09/12] Search for since per-channel --- cisticola/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 5bac90b..1960467 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -6,6 +6,7 @@ from loguru import logger MAX_POSTS = 10 + class ScraperController: """Registers scrapers, uses them to generate ScraperResults. Synchronizes everything with database via ORM.""" @@ -34,7 +35,8 @@ class ScraperController: # get most recent post session = self.session() - rows = session.query(cisticola.base.ScraperResult).order_by( + rows = session.query(cisticola.base.ScraperResult).where( + cisticola.base.ScraperResult.channel == channel.id).order_by( cisticola.base.ScraperResult.date.desc()).limit(1).all() if len(rows) == 1: From 3480452fac32355c46640799936304da9eba964d Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:36:23 +0100 Subject: [PATCH 10/12] Fix type hints --- cisticola/scraper/base.py | 4 ++-- cisticola/scraper/bitchute.py | 4 ++-- cisticola/scraper/gettr.py | 4 ++-- cisticola/scraper/telegram_snscrape.py | 4 ++-- cisticola/scraper/twitter.py | 5 ++--- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 3a399ec..4d31370 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Generator import cisticola.base import requests import os @@ -55,5 +55,5 @@ class Scraper: def can_handle(self, channel: cisticola.base.Channel) -> bool: pass - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: pass diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index ed144dc..c5c9f66 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -4,7 +4,7 @@ import re from html.parser import HTMLParser import dateparser import json -from typing import List +from typing import Generator import requests from bs4 import BeautifulSoup @@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: session = requests.Session() session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 5ae7d96..9a52a69 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -2,7 +2,7 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime import json -from typing import List +from typing import Generator from gogettr import PublicClient class GettrScraper(cisticola.scraper.base.Scraper): @@ -16,7 +16,7 @@ class GettrScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index c76910f..bba9276 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -1,6 +1,6 @@ import cisticola.base import cisticola.scraper.base -from typing import List +from typing import Generator import snscrape.modules from datetime import datetime, timezone @@ -12,7 +12,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): if channel.platform == "Telegram" and channel.public and not channel.chat: return True - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None): + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 4793b49..a43365c 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,7 +1,7 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime, timezone -from typing import List +from typing import Generator import snscrape.modules from loguru import logger @@ -10,7 +10,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id) first = True @@ -22,7 +22,6 @@ class TwitterScraper(cisticola.scraper.base.Scraper): first = False continue else: - print('too far') break archived_urls = {} From e6085689b5c7ad6f918da8fd3a2d2cf213c8510d Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:47:46 +0100 Subject: [PATCH 11/12] On second thought, don't share secrets --- .env | 5 ----- .gitignore | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 .env diff --git a/.env b/.env deleted file mode 100644 index ace9ca9..0000000 --- a/.env +++ /dev/null @@ -1,5 +0,0 @@ -DO_SPACES_REGION=ams3 -DO_SPACES_KEY=DKIMQ7ABHPOBC4OZDEQR -DO_SPACES_SECRET=uqKaPQsV4WmskQr8/O2NTS+OHiTNV2yVJn8u9Ny0rsA -DO_BUCKET=cisticola-test -DO_URL=https://cisticola-test.ams3.digitaloceanspaces.com \ No newline at end of file diff --git a/.gitignore b/.gitignore index 643d95d..b107dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ *.db docs/build/ docs/source/_* - +.env From 8ab56ff5baaeecf1fcaf6d6a84b7c808398d8a17 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 25 Feb 2022 08:52:42 +0100 Subject: [PATCH 12/12] Remove MAX_POSTS, auto detect MIME type Co-authored-by: Tristan Lee --- cisticola/__init__.py | 4 ---- cisticola/scraper/base.py | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 1960467..2b873cc 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -4,8 +4,6 @@ import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger -MAX_POSTS = 10 - class ScraperController: """Registers scrapers, uses them to generate ScraperResults. Synchronizes @@ -49,8 +47,6 @@ class ScraperController: for post in posts: session.add(post) added += 1 - if added >= MAX_POSTS: - break session.commit() logger.info( diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 4d31370..a41a0b2 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -38,6 +38,8 @@ class Scraper: return url blob = r.content + + content_type = r.headers.get('Content-Type') if key is None: key = url.split('/')[-1] @@ -46,7 +48,7 @@ class Scraper: filename = self.__version__.replace(' ', '_') + '/' + key self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv( - 'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': 'image/jpeg'}) + 'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type}) archived_url = os.getenv('DO_URL') + '/' + filename