mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Ensured that all channels are added to the database before being scraped, preventing channel.platform_id from being null.
This commit is contained in:
@@ -358,7 +358,23 @@ class ScraperController:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
# If any channels are not already in the database, add them
|
||||
for channel in channels:
|
||||
|
||||
platform_id = None
|
||||
if channel.platform_id not in (None, ''):
|
||||
platform_id = channel.platform_id
|
||||
|
||||
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
|
||||
|
||||
if not channel_in_db:
|
||||
logger.debug(f"{channel} does not exist in database, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
handled = False
|
||||
|
||||
for scraper in self.scrapers:
|
||||
@@ -368,7 +384,6 @@ class ScraperController:
|
||||
added = 0
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
rows = session.query(ScraperResult).where(
|
||||
ScraperResult.channel == channel.id).order_by(
|
||||
ScraperResult.date.desc()).limit(1).all()
|
||||
@@ -378,10 +393,6 @@ class ScraperController:
|
||||
else:
|
||||
since = None
|
||||
|
||||
# TODO: currently, if a channel has not been added to the database (i.e. channel.id is None), the `since` query returns the most recently scraped ScraperResult with channel.id == None, which can belong to a different platform and channel. Maybe add a check to the query logic above that channel.id isn't null.
|
||||
if channel.id is None:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
|
||||
|
||||
for post in posts:
|
||||
|
||||
Reference in New Issue
Block a user