From 0bab20e37176320fcdf88fdd2ffb15bf2254442a Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 1 Apr 2022 17:03:02 -0500 Subject: [PATCH] Ensure all channels are added to the database before they are scraped, preventing channel.platform_id from being null. --- cisticola/scraper/base.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index e08bf8e..0762c16 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -358,7 +358,23 @@ class ScraperController: logger.error("No DB session") return + session = self.session() + + # If any channels are not already in the database, add them for channel in channels: + + platform_id = None + if channel.platform_id not in (None, ''): + platform_id = channel.platform_id + + channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first() + + if not channel_in_db: + logger.debug(f"{channel} does not exist in database, adding") + session.add(channel) + session.flush() + session.commit() + handled = False for scraper in self.scrapers: @@ -368,7 +384,6 @@ class ScraperController: added = 0 # get most recent post - session = self.session() rows = session.query(ScraperResult).where( ScraperResult.channel == channel.id).order_by( ScraperResult.date.desc()).limit(1).all() @@ -378,10 +393,6 @@ class ScraperController: else: since = None - # TODO currently, if channels haven't been added to the database, if channel.id is None, the `since` returns the most recently scraped ScraperResult with channel.id == None, which can be from a different platform and channel. Maybe add check in above query logic that channel.id isn't null. - if channel.id is None: - since = None - posts = scraper.get_posts(channel, since=since, archive_media=archive_media) for post in posts: