diff --git a/app.py b/app.py index a81258c..e7f9969 100644 --- a/app.py +++ b/app.py @@ -31,37 +31,36 @@ def sync_channels(args): row = 2 for c in channels: - logger.info(c) - del c['id'] - del c['followers'] + # only adding channels, so skip everything with an ID + if c['id'] == '': + del c['id'] + del c['followers'] - if c['public'] == '': c['public'] = False - if c['chat'] == '': c['chat'] = False - - for k in c.keys(): - if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True - if c[k] == 'FALSE' or c[k] == 'no': c[k] = False - - if c[k] == '': c[k] = None - - - # check to see if this already exists, - platform_id = None - if c['platform_id'] != '': - platform_id = c['platform_id'] - - channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first() - logger.info(channel) - - if not channel: - channel = Channel(**c, source='researcher') - logger.debug(f"{channel} does not exist, adding") - session.add(channel) - session.flush() - session.commit() + if c['public'] == '': c['public'] = False + if c['chat'] == '': c['chat'] = False - wks.update_cell(row, 1, channel.id) - time.sleep(1) + for k in c.keys(): + if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True + if c[k] == 'FALSE' or c[k] == 'no': c[k] = False + + if c[k] == '': c[k] = None + + # check to see if this already exists, + platform_id = None + if c['platform_id'] != '': + platform_id = c['platform_id'] + + channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first() + + if not channel: + channel = Channel(**c, source='researcher') + logger.debug(f"{channel} does not exist, adding") + session.add(channel) + session.flush() + session.commit() + + wks.update_cell(row, 1, channel.id) + time.sleep(1) row += 1 @@ -83,7 +82,7 @@ def get_scraper_controller(): controller.connect_to_db(engine) scrapers = [ - TelegramTelethonScraper(), + # TelegramTelethonScraper(), TwitterScraper()] controller.register_scrapers(scrapers) @@ -93,13 +92,19 @@ def get_scraper_controller(): def scrape_channels(args): logger.info(f"Scraping channels, media: {args.media}") - controller = get_scraper_controller(args) + controller = get_scraper_controller() controller.scrape_all_channels(archive_media = args.media) +def scrape_channel_info(args): + logger.info(f"Scraping channel info") + + controller = get_scraper_controller() + controller.scrape_all_channel_info() + def archive_media(args): logger.info(f"Archiving unarchived media") - controller = get_scraper_controller(args) + controller = get_scraper_controller() controller.archive_unarchived_media() def init_db(): @@ -124,5 +129,7 @@ if __name__ == '__main__': scrape_channels(args) elif args.command == 'archive-media': archive_media(args) + elif args.command == 'channel-info': + scrape_channel_info(args) else: logger.error(f"Unrecognized command {args.command}") diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a0ca904..4ab84b5 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -313,6 +313,17 @@ class ScraperController: channels = session.query(Channel).where(Channel.source=='researcher').all() return self.scrape_channels(channels, archive_media=archive_media) + + def scrape_all_channel_info(self): + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + + channels = session.query(Channel).where(Channel.source=='researcher').all() + + return self.scrape_channel_info(channels) @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): @@ -397,6 +408,48 @@ class ScraperController: session.commit() + @logger.catch(reraise = True) + def scrape_channel_info(self, channels: List[Channel]): + """Scrape channel info for specified channels. + + Parameters + ---------- + channels: list + List of Channel instances to be scraped + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + """ + + if self.session is None: + logger.error("No DB session") + return + + for channel in channels: + handled = False + + for scraper in self.scrapers: + if scraper.can_handle(channel): + logger.debug(f"{scraper} is getting channel info for {channel}") + handled = True + + # get most recent post + session = self.session() + + try: + info = scraper.get_profile(channel) + session.add(info) + + session.commit() + logger.info( + f"{scraper} found {info}") + break + except ChannelDoesNotExistError: + logger.warning(f"ChannelDoesNotExist {channel}") + + if not handled: + logger.warning(f"No handler found for Channel {channel}") + def connect_to_db(self, engine): """Connect the specified SQLAlchemy engine to the controller. """ diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 7befda5..fb9a58f 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -106,7 +106,9 @@ class TelegramTelethonScraper(Scraper): return True def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = self.get_username_from_url(channel.url) + username = channel.screenname + if username is None: + username = self.get_username_from_url(channel.url) api_id = os.environ['TELEGRAM_API_ID'] api_hash = os.environ['TELEGRAM_API_HASH'] @@ -146,8 +148,9 @@ class TelegramTelethonScraper(Scraper): media_archived=archive_media) def get_profile(self, channel: Channel) -> RawChannelInfo: - - username = self.get_username_from_url(channel.url) + username = channel.screenname + if username is None: + username = self.get_username_from_url(channel.url) api_id = os.environ['TELEGRAM_API_ID'] api_hash = os.environ['TELEGRAM_API_HASH'] @@ -155,10 +158,10 @@ class TelegramTelethonScraper(Scraper): with TelegramClient(phone, api_id, api_hash) as client: full_channel = client(GetFullChannelRequest(channel = username)) - profile = full_channel.__dict__ + profile = full_channel.to_dict() return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, - raw_data=json.dumps(profile), + raw_data=json.dumps(profile, default=str), date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 7252ebd..43f6615 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -13,7 +13,12 @@ class TwitterScraper(Scraper): __version__ = "TwitterScraper 0.0.1" def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - scraper = TwitterProfileScraper(channel.platform_id) + if channel.platform_id: + identifier = channel.platform_id + else: + identifier = channel.screenname + + scraper = TwitterProfileScraper(identifier) first = True @@ -71,7 +76,7 @@ class TwitterScraper(Scraper): media_archived=archive_media) def can_handle(self, channel): - if channel.platform == "Twitter" and channel.platform_id: + if channel.platform == "Twitter" and (channel.platform_id or channel.screenname): return True def url_to_key(self, url: str, content_type: str) -> str: @@ -102,5 +107,5 @@ class TwitterScraper(Scraper): return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, - raw_data=json.dumps(emtity.__dict__), + raw_data=json.dumps(entity.__dict__, default=str), date_archived=datetime.now(timezone.utc))