Add functions for scraping based on Channel database

This commit is contained in:
Logan Williams
2022-03-22 11:26:46 +01:00
parent 885b4687ce
commit 806f07f458
2 changed files with 36 additions and 18 deletions

View File

@@ -133,6 +133,17 @@ class ScraperController:
def register_scrapers(self, scraper: List[Scraper]):
    """Append the given scrapers to this controller's scraper list.

    Args:
        scraper: Scrapers to add; each one is appended to
            ``self.scrapers`` in the order given.
    """
    registered = self.scrapers
    registered.extend(scraper)
def scrape_all_channels(self, archive_media: bool = True):
    """Scrape the channels stored in the database with source 'researcher'.

    Despite the name, only ``Channel`` rows whose ``source`` equals
    ``'researcher'`` are selected; they are then handed to
    ``scrape_channels``.

    Args:
        archive_media: Forwarded unchanged to ``scrape_channels``.

    Returns:
        Whatever ``scrape_channels`` returns, or ``None`` (after logging
        an error) when ``self.session`` is ``None``.
    """
    make_session = self.session
    if make_session is None:
        logger.error("No DB session")
        return

    # NOTE(review): the session opened here is not closed in this method --
    # confirm whether the caller or the session factory owns its lifetime.
    db = make_session()
    researcher_channels = (
        db.query(Channel)
        .where(Channel.source == 'researcher')
        .all()
    )
    return self.scrape_channels(
        researcher_channels,
        archive_media=archive_media,
    )
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
@@ -145,7 +156,6 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.can_handle(channel):
session = self.session()
handled = True
added = 0

View File

@@ -29,25 +29,33 @@ class TwitterScraper(Scraper):
archived_urls = {}
if archive_media:
media_list = []
if tweet.media:
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
media_list += tweet.media
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
if tweet.retweetedTweet and tweet.retweetedTweet.media:
media_list += tweet.retweetedTweet.media
if tweet.quotedTweet and tweet.quotedTweet.media:
media_list += tweet.quotedTweet.media
for media in media_list:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None and url not in archived_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,