mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge pull request #40 from bellingcat/next-release
Add indices on appropriate columns; limit # of posts to archive
This commit is contained in:
@@ -246,21 +246,21 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('date', DateTime, index=True),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
Column('archived_urls', JSON),
|
||||
Column('media_archived', DateTime))
|
||||
Column('media_archived', DateTime, index=True))
|
||||
|
||||
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime))
|
||||
Column('date_archived', DateTime, index=True))
|
||||
|
||||
channel_table = Table('channels', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
@@ -281,20 +281,20 @@ channel_table = Table('channels', mapper_registry.metadata,
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
|
||||
Column('platform_id', Integer),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('date', DateTime, index=True),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
Column('url', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String),
|
||||
Column('content', String),
|
||||
Column('forwarded_from', Integer, ForeignKey('channels.id')),
|
||||
Column('reply_to', Integer, ForeignKey('posts.id'))
|
||||
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('reply_to', Integer, ForeignKey('posts.id'), index=True)
|
||||
)
|
||||
|
||||
media_table = Table('media', mapper_registry.metadata,
|
||||
|
||||
@@ -419,7 +419,9 @@ class ScraperController:
|
||||
|
||||
session = self.session()
|
||||
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).all()
|
||||
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
||||
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all()
|
||||
|
||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user