diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index d7f69a1..4a3e57e 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -133,6 +133,25 @@ class ScraperController:
 
     def register_scrapers(self, scraper: List[Scraper]):
         self.scrapers.extend(scraper)
+
+    def scrape_all_channels(self, archive_media: bool = True):
+        """Scrape every channel whose ``source`` is ``'researcher'``.
+
+        Logs an error and returns ``None`` when ``self.session`` is ``None``;
+        otherwise queries the matching channels and returns the result of
+        ``scrape_channels`` for them.
+        """
+        if self.session is None:
+            logger.error("No DB session")
+            return None
+
+        # NOTE(review): this session is not closed in this method — confirm
+        # whether the caller or ``scrape_channels`` is expected to own it.
+        session = self.session()
+
+        channels = session.query(Channel).where(Channel.source == 'researcher').all()
+
+        return self.scrape_channels(channels, archive_media=archive_media)
 
     @logger.catch(reraise = True)
     def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
@@ -145,7 +164,6 @@ class ScraperController:
 
             for scraper in self.scrapers:
                 if scraper.can_handle(channel):
-                    session = self.session()
                     handled = True
                     added = 0
 
diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py
index c019e27..8209282 100644
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -29,25 +29,42 @@ class TwitterScraper(Scraper):
             archived_urls = {}
 
             if archive_media:
-
+                # Collect media from the tweet itself and from any retweeted
+                # or quoted tweet it wraps.
+                media_list = []
                 if tweet.media:
-                    for media in tweet.media:
-                        if type(media) == Video:
-                            variant = max(
-                                [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
-                            url = variant.url
-                        elif type(media) == Gif:
-                            url = media.variants[0].url
-                        elif type(media) == Photo:
-                            url = media.fullUrl
-                        else:
-                            logger.warning(f"Could not get media URL of {media}")
-                            url = None
+                    media_list += tweet.media
 
-                        if url is not None:
-                            media_blob, content_type, key = self.url_to_blob(url)
-                            archived_url = self.archive_blob(media_blob, content_type, key)
-                            archived_urls[url] = archived_url
+                if tweet.retweetedTweet and tweet.retweetedTweet.media:
+                    media_list += tweet.retweetedTweet.media
+
+                if tweet.quotedTweet and tweet.quotedTweet.media:
+                    media_list += tweet.quotedTweet.media
+
+                for media in media_list:
+                    url = None
+                    if isinstance(media, Video):
+                        # Pick the highest-bitrate variant; variants without a
+                        # bitrate are ignored, and there may be none left.
+                        variant = max(
+                            (v for v in media.variants if v.bitrate),
+                            key=lambda v: v.bitrate, default=None)
+                        if variant is not None:
+                            url = variant.url
+                    elif isinstance(media, Gif):
+                        if media.variants:
+                            url = media.variants[0].url
+                    elif isinstance(media, Photo):
+                        url = media.fullUrl
+
+                    if url is None:
+                        logger.warning(f"Could not get media URL of {media}")
+                    # The same URL can appear more than once across the
+                    # collected lists; archive each one a single time.
+                    elif url not in archived_urls:
+                        media_blob, content_type, key = self.url_to_blob(url)
+                        archived_url = self.archive_blob(media_blob, content_type, key)
+                        archived_urls[url] = archived_url
 
             yield ScraperResult(
                 scraper=self.__version__,