From e64d84500247ff91c0e8e0f5fefe0fb88e97dbd3 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 18:48:48 +0100 Subject: [PATCH] Archive media in Twitter scraper --- Pipfile.lock | 2 +- cisticola/scraper/twitter.py | 40 ++++++++++++++++++++++++++++++++---- test.py | 11 +++++----- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 919e8f0..8a11934 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -509,7 +509,7 @@ }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", - "ref": "72b26f2373f3fecf53bdf9c62d7408df3d15a329" + "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" }, "soupsieve": { "hashes": [ diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index e3c399d..bb85f48 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,8 +1,9 @@ import cisticola.base import cisticola.scraper.base -from datetime import datetime +from datetime import datetime, timezone from typing import List import snscrape.modules +from loguru import logger class TwitterScraper(cisticola.scraper.base.Scraper): @@ -20,13 +21,43 @@ class TwitterScraper(cisticola.scraper.base.Scraper): def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: posts = [] - scraper = snscrape.modules.twitter.TwitterUserScraper( + scraper = snscrape.modules.twitter.TwitterProfileScraper( TwitterScraper.get_username_from_url(channel.url)) + first = True + for tweet in scraper.get_items(): - if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp(): + if len(posts) >= 10: break + if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + # with TwitterProfileScraper, the first tweet could be an old pinned tweet + if first: + first = False + continue + else: + break + + archived_urls = {} + + if tweet.media: + for media in tweet.media: + if type(media) == snscrape.modules.twitter.Video: + variant = max( + [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) + url = variant.url + elif type(media) == snscrape.modules.twitter.Gif: + url = media.variants[0].url + elif type(media) == snscrape.modules.twitter.Photo: + url = media.fullUrl + else: + logger.warning(f"Could not get media URL of {media}") + url = None + + if url is not None: + archived_url = self.archive_media(url) + archived_urls[url] = archived_url + posts.append(cisticola.base.ScraperResult( scraper=self.__version__, platform="Twitter", @@ -34,7 +65,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper): platform_id=tweet.id, date=tweet.date, date_archived=datetime.now(), - raw_data=tweet.json())) + raw_data=tweet.json(), + archived_urls=archived_urls)) return posts diff --git a/test.py b/test.py index c3e4d72..0e4a6e0 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,6 @@ import cisticola import cisticola.scraper.telegram_snscrape +import cisticola.scraper.twitter from sqlalchemy import create_engine @@ -25,13 +26,13 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat controller = cisticola.ScraperController() -# scraper = cisticola.scraper.twitter.TwitterScraper() -# controller.register_scraper(scraper) +twitter = cisticola.scraper.twitter.TwitterScraper() +controller.register_scraper(twitter) -scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() -controller.register_scraper(scraper) +telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() +controller.register_scraper(telegram) -engine = create_engine('sqlite:///test4.db') +engine = create_engine('sqlite:///test.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels)