Yield scraped posts from get_posts rather than returning a list

This commit is contained in:
Logan Williams
2022-02-24 18:58:08 +01:00
parent d163e6b3d9
commit d159c09aa4
3 changed files with 18 additions and 24 deletions

View File

@@ -4,6 +4,7 @@ import cisticola.scraper.base
from sqlalchemy.orm import sessionmaker
from loguru import logger
MAX_POSTS = 10
class ScraperController:
"""Registers scrapers, uses them to generate ScraperResults. Synchronizes
@@ -27,6 +28,10 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.can_handle(channel):
session = self.session()
handled = True
added = 0
# get most recent post
session = self.session()
rows = session.query(cisticola.base.ScraperResult).order_by(
@@ -38,20 +43,20 @@ class ScraperController:
since = None
posts = scraper.get_posts(channel, since=since)
handled = True
for post in posts:
session.add(post)
added += 1
if added >= MAX_POSTS:
break
session.commit()
logger.info(
f"{scraper} found {len(posts)} new posts from {channel}")
f"{scraper} found {added} new posts from {channel}")
break
if not handled:
logger.warning(f"No handler found for Channel {channel}")
else:
session = self.session()
session.bulk_save_objects(posts)
session.commit()
logger.info(f"Added {len(posts)} entries to database")
def connect_to_db(self, engine):
# create tables

View File

@@ -13,16 +13,12 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
return True
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None):
posts = []
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
g = scr.get_items()
for post in g:
if (len(posts)) >= 10:
break
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
@@ -36,7 +32,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
video_archive_url = self.archive_media(post.video)
archived_urls[post.video] = video_archive_url
posts.append(cisticola.base.ScraperResult(
yield cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Telegram",
channel=channel.id,
@@ -45,6 +41,4 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls
))
return posts
)

View File

@@ -20,22 +20,19 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
return username
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
posts = []
scraper = snscrape.modules.twitter.TwitterProfileScraper(
TwitterScraper.get_username_from_url(channel.url))
first = True
for tweet in scraper.get_items():
if len(posts) >= 10:
break
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
if first:
first = False
continue
else:
print('too far')
break
archived_urls = {}
@@ -58,7 +55,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
archived_url = self.archive_media(url)
archived_urls[url] = archived_url
posts.append(cisticola.base.ScraperResult(
yield cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Twitter",
channel=channel.id,
@@ -66,9 +63,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json(),
archived_urls=archived_urls))
return posts
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: