Merge pull request #40 from bellingcat/next-release

Add indices on appropriate columns; limit # of posts to archive
2026-06-08 03:18:34 +03:00 · 2022-04-04 13:03:16 +02:00
parent fccbad7a93 b6386747d4
commit 36c81c8e17
2 changed files with 15 additions and 13 deletions
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -246,21 +246,21 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
                              autoincrement=True),
                       Column('scraper', String),
                       Column('platform', String),
-                       Column('channel', Integer, ForeignKey('channels.id')),
+                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
                       Column('platform_id', String),
-                       Column('date', DateTime),
+                       Column('date', DateTime, index=True),
                       Column('raw_data', String),
-                       Column('date_archived', DateTime),
+                       Column('date_archived', DateTime, index=True),
                       Column('archived_urls', JSON),
-                       Column('media_archived', DateTime))
+                       Column('media_archived', DateTime, index=True))

 raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True),
                    Column('scraper', String),
                    Column('platform', String),
-                    Column('channel', Integer, ForeignKey('channels.id')),
+                    Column('channel', Integer, ForeignKey('channels.id'), index=True),
                    Column('raw_data', String),
-                    Column('date_archived', DateTime))
+                    Column('date_archived', DateTime, index=True))

 channel_table = Table('channels', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True, autoincrement=True),
@@ -281,20 +281,20 @@ channel_table = Table('channels', mapper_registry.metadata,
 post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
-                       Column('raw_id', Integer, ForeignKey('raw_posts.id')),
+                       Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
                       Column('platform_id', Integer),
                       Column('scraper', String),
                       Column('transformer', String),
                       Column('platform', String),
-                       Column('channel', Integer, ForeignKey('channels.id')),
-                       Column('date', DateTime),
-                       Column('date_archived', DateTime),
+                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
+                       Column('date', DateTime, index=True),
+                       Column('date_archived', DateTime, index=True),
                       Column('url', String),
                       Column('author_id', String),
                       Column('author_username', String),
                       Column('content', String),
-                       Column('forwarded_from', Integer, ForeignKey('channels.id')),
-                       Column('reply_to', Integer, ForeignKey('posts.id'))
+                       Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
+                       Column('reply_to', Integer, ForeignKey('posts.id'), index=True)
                       )

 media_table = Table('media', mapper_registry.metadata,
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -419,7 +419,9 @@ class ScraperController:

        session = self.session()

-        posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).all()
+        # This query is really slow (~2.5 minutes) because of the shuffle. Shuffling lets multiple media archivers work
+        # simultaneously with a low risk of collision (at least while the number of unarchived items is very large).
+        posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all()

        logger.info(f"Found {len(posts)} posts without media. Archiving now")