From b6386747d478ab82948c33f614be67e543252775 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 4 Apr 2022 10:54:27 +0000 Subject: [PATCH] Add indices on appropriate columns; limit # of posts to archive --- cisticola/base.py | 24 ++++++++++++------------ cisticola/scraper/base.py | 4 +++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index bcc4618..bd70c12 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -246,21 +246,21 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata, autoincrement=True), Column('scraper', String), Column('platform', String), - Column('channel', Integer, ForeignKey('channels.id')), + Column('channel', Integer, ForeignKey('channels.id'), index=True), Column('platform_id', String), - Column('date', DateTime), + Column('date', DateTime, index=True), Column('raw_data', String), - Column('date_archived', DateTime), + Column('date_archived', DateTime, index=True), Column('archived_urls', JSON), - Column('media_archived', DateTime)) + Column('media_archived', DateTime, index=True)) raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata, Column('id', Integer, primary_key=True), Column('scraper', String), Column('platform', String), - Column('channel', Integer, ForeignKey('channels.id')), + Column('channel', Integer, ForeignKey('channels.id'), index=True), Column('raw_data', String), - Column('date_archived', DateTime)) + Column('date_archived', DateTime, index=True)) channel_table = Table('channels', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), @@ -281,20 +281,20 @@ channel_table = Table('channels', mapper_registry.metadata, post_table = Table('posts', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), - Column('raw_id', Integer, ForeignKey('raw_posts.id')), + Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True), Column('platform_id', Integer), Column('scraper', String), Column('transformer', String), Column('platform', String), - Column('channel', Integer, ForeignKey('channels.id')), - Column('date', DateTime), - Column('date_archived', DateTime), + Column('channel', Integer, ForeignKey('channels.id'), index=True), + Column('date', DateTime, index=True), + Column('date_archived', DateTime, index=True), Column('url', String), Column('author_id', String), Column('author_username', String), Column('content', String), - Column('forwarded_from', Integer, ForeignKey('channels.id')), - Column('reply_to', Integer, ForeignKey('posts.id')) + Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True), + Column('reply_to', Integer, ForeignKey('posts.id'), index=True) ) media_table = Table('media', mapper_registry.metadata, diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 5c8e4be..0144557 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -419,7 +419,9 @@ class ScraperController: session = self.session() - posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).all() + # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work + # simultaneously with low risk of collision (at least while the number of unarchived items is very large) + posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all() logger.info(f"Found {len(posts)} posts without media. Archiving now")