mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 21:08:34 +03:00
Fix Twitter profile scraper, catch exceptions in controller
This commit is contained in:
71
app.py
71
app.py
@@ -31,37 +31,36 @@ def sync_channels(args):
|
||||
row = 2
|
||||
|
||||
for c in channels:
|
||||
logger.info(c)
|
||||
del c['id']
|
||||
del c['followers']
|
||||
# only adding channels, so skip everything with an ID
|
||||
if c['id'] == '':
|
||||
del c['id']
|
||||
del c['followers']
|
||||
|
||||
if c['public'] == '': c['public'] = False
|
||||
if c['chat'] == '': c['chat'] = False
|
||||
|
||||
for k in c.keys():
|
||||
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
|
||||
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
|
||||
|
||||
if c[k] == '': c[k] = None
|
||||
|
||||
|
||||
# check to see if this already exists,
|
||||
platform_id = None
|
||||
if c['platform_id'] != '':
|
||||
platform_id = c['platform_id']
|
||||
|
||||
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
|
||||
logger.info(channel)
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
if c['public'] == '': c['public'] = False
|
||||
if c['chat'] == '': c['chat'] = False
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
time.sleep(1)
|
||||
for k in c.keys():
|
||||
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
|
||||
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
|
||||
|
||||
if c[k] == '': c[k] = None
|
||||
|
||||
# check to see if this already exists,
|
||||
platform_id = None
|
||||
if c['platform_id'] != '':
|
||||
platform_id = c['platform_id']
|
||||
|
||||
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
time.sleep(1)
|
||||
|
||||
row += 1
|
||||
|
||||
@@ -83,7 +82,7 @@ def get_scraper_controller():
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
scrapers = [
|
||||
TelegramTelethonScraper(),
|
||||
# TelegramTelethonScraper(),
|
||||
TwitterScraper()]
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
@@ -93,13 +92,19 @@ def get_scraper_controller():
|
||||
def scrape_channels(args):
    """Scrape all channels through the scraper controller.

    Parameters
    ----------
    args:
        Parsed command-line arguments. ``args.media`` is forwarded to the
        controller as ``archive_media``.
    """
    logger.info(f"Scraping channels, media: {args.media}")

    # get_scraper_controller() takes no parameters; the previous extra call
    # that passed ``args`` would raise TypeError before any scraping started.
    controller = get_scraper_controller()
    controller.scrape_all_channels(archive_media=args.media)
|
||||
|
||||
def scrape_channel_info(args):
    """Scrape channel info for all channels through the scraper controller.

    Parameters
    ----------
    args:
        Parsed command-line arguments (not read by this command).
    """
    logger.info(f"Scraping channel info")

    # Build the controller and delegate straight to it.
    get_scraper_controller().scrape_all_channel_info()
|
||||
|
||||
def archive_media(args):
    """Archive media that has not been archived yet, via the controller.

    Parameters
    ----------
    args:
        Parsed command-line arguments (not read by this command).
    """
    logger.info(f"Archiving unarchived media")

    # get_scraper_controller() takes no parameters; the previous extra call
    # that passed ``args`` would raise TypeError before any archiving started.
    controller = get_scraper_controller()
    controller.archive_unarchived_media()
|
||||
|
||||
def init_db():
|
||||
@@ -124,5 +129,7 @@ if __name__ == '__main__':
|
||||
scrape_channels(args)
|
||||
elif args.command == 'archive-media':
|
||||
archive_media(args)
|
||||
elif args.command == 'channel-info':
|
||||
scrape_channel_info(args)
|
||||
else:
|
||||
logger.error(f"Unrecognized command {args.command}")
|
||||
|
||||
@@ -313,6 +313,17 @@ class ScraperController:
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media)
|
||||
|
||||
def scrape_all_channel_info(self):
    """Scrape channel info for every channel whose source is 'researcher'.

    Logs an error and returns ``None`` when no DB session factory has been
    connected; otherwise returns the result of ``scrape_channel_info``.
    """
    if self.session is None:
        logger.error("No DB session")
        return

    db = self.session()

    # Only researcher-sourced channels are selected.
    researcher_query = db.query(Channel).where(Channel.source=='researcher')
    researcher_channels = researcher_query.all()

    return self.scrape_channel_info(researcher_channels)
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
@@ -397,6 +408,48 @@ class ScraperController:
|
||||
|
||||
session.commit()
|
||||
|
||||
@logger.catch(reraise=True)
def scrape_channel_info(self, channels: List[Channel]):
    """Scrape and store profile information for the specified channels.

    For each channel, the scrapers in ``self.scrapers`` are tried in order.
    Each scraper whose ``can_handle`` accepts the channel is asked for the
    channel's profile via ``get_profile``; the first profile obtained is
    added to a DB session and committed, and the remaining scrapers are
    skipped for that channel.

    Parameters
    ----------
    channels: list<Channel>
        List of Channel instances whose profile info should be scraped
    """

    if self.session is None:
        logger.error("No DB session")
        return

    for channel in channels:
        handled = False

        for scraper in self.scrapers:
            if not scraper.can_handle(channel):
                continue

            logger.debug(f"{scraper} is getting channel info for {channel}")
            handled = True

            # One short-lived session per attempt. It is closed in
            # ``finally`` so it is released whether the scraper succeeds,
            # reports a missing channel, or raises something unexpected.
            session = self.session()

            try:
                info = scraper.get_profile(channel)
                session.add(info)

                session.commit()
                logger.info(
                    f"{scraper} found {info}")
                break
            except ChannelDoesNotExistError:
                # Not fatal: log it and let any remaining scraper try.
                logger.warning(f"ChannelDoesNotExist {channel}")
            finally:
                session.close()

        if not handled:
            logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
@@ -106,7 +106,9 @@ class TelegramTelethonScraper(Scraper):
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -146,8 +148,9 @@ class TelegramTelethonScraper(Scraper):
|
||||
media_archived=archive_media)
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -155,10 +158,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
full_channel = client(GetFullChannelRequest(channel = username))
|
||||
profile = full_channel.__dict__
|
||||
profile = full_channel.to_dict()
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
raw_data=json.dumps(profile, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -13,7 +13,12 @@ class TwitterScraper(Scraper):
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
scraper = TwitterProfileScraper(channel.platform_id)
|
||||
if channel.platform_id:
|
||||
identifier = channel.platform_id
|
||||
else:
|
||||
identifier = channel.screenname
|
||||
|
||||
scraper = TwitterProfileScraper(identifier)
|
||||
|
||||
first = True
|
||||
|
||||
@@ -71,7 +76,7 @@ class TwitterScraper(Scraper):
|
||||
media_archived=archive_media)
|
||||
|
||||
def can_handle(self, channel):
    """Return whether this scraper can handle ``channel``.

    A channel is handled when its ``platform`` is ``"Twitter"`` and it has
    either a ``platform_id`` or a ``screenname`` to identify the profile.
    The stale outer check that required ``platform_id`` is removed, so
    screenname-only channels are accepted, and the negative case now
    returns an explicit ``False`` instead of an implicit ``None``.
    """
    if channel.platform != "Twitter":
        return False

    return bool(channel.platform_id or channel.screenname)
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
@@ -102,5 +107,5 @@ class TwitterScraper(Scraper):
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(emtity.__dict__),
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
Reference in New Issue
Block a user