Merge pull request #40 from bellingcat/next-release

Add indices on appropriate columns; limit the number of posts to archive
This commit is contained in:
Logan Williams
2022-04-04 13:03:16 +02:00
committed by GitHub
2 changed files with 15 additions and 13 deletions

View File

@@ -246,21 +246,21 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('date', DateTime),
Column('date', DateTime, index=True),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('date_archived', DateTime, index=True),
Column('archived_urls', JSON),
Column('media_archived', DateTime))
Column('media_archived', DateTime, index=True))
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('raw_data', String),
Column('date_archived', DateTime))
Column('date_archived', DateTime, index=True))
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
@@ -281,20 +281,20 @@ channel_table = Table('channels', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('date', DateTime, index=True),
Column('date_archived', DateTime, index=True),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id')),
Column('reply_to', Integer, ForeignKey('posts.id'))
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
Column('reply_to', Integer, ForeignKey('posts.id'), index=True)
)
media_table = Table('media', mapper_registry.metadata,

View File

@@ -419,7 +419,9 @@ class ScraperController:
session = self.session()
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).all()
# This query is really slow (~2.5 minutes) because of the shuffle. The shuffle is there so that multiple media archivers can work
# simultaneously with a low risk of collision (at least while the number of unarchived items is very large).
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now")