mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge pull request #53 from bellingcat/sync-channels
Close sessions; sort channel info by least recently archived
This commit is contained in:
@@ -12,8 +12,9 @@ from sqlalchemy.orm import sessionmaker
|
||||
import yt_dlp
|
||||
from sqlalchemy.sql.expression import func
|
||||
from pathlib import Path
|
||||
from sqlalchemy import nullsfirst
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
@@ -340,6 +341,8 @@ class ScraperController:
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
|
||||
session.close()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media)
|
||||
|
||||
def scrape_all_channel_info(self):
|
||||
@@ -349,8 +352,15 @@ class ScraperController:
|
||||
|
||||
session = self.session()
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
# Because of rate limiting, we may not be able to succesfully scrape info for all of these channels.
|
||||
# This will sort the channels by the least recently scraped.
|
||||
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
|
||||
channels = session.query(Channel).\
|
||||
where(Channel.source=='researcher').\
|
||||
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
|
||||
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
|
||||
|
||||
session.close()
|
||||
return self.scrape_channel_info(channels)
|
||||
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
@@ -419,6 +429,8 @@ class ScraperController:
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def archive_unarchived_media(self):
|
||||
if self.session is None:
|
||||
@@ -452,6 +464,7 @@ class ScraperController:
|
||||
logger.warning(f"No handler found for post scraped with {post.scraper}")
|
||||
|
||||
session.commit()
|
||||
session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channel_info(self, channels: List[Channel]):
|
||||
@@ -470,6 +483,7 @@ class ScraperController:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
for channel in channels:
|
||||
handled = False
|
||||
|
||||
@@ -479,10 +493,13 @@ class ScraperController:
|
||||
handled = True
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
|
||||
try:
|
||||
info = scraper.get_profile(channel)
|
||||
if info is None:
|
||||
logger.warning(f"No info returned for {channel}")
|
||||
break
|
||||
|
||||
session.add(info)
|
||||
|
||||
session.commit()
|
||||
@@ -495,6 +512,8 @@ class ScraperController:
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
session.close()
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user