From a25426043b2b06a884c004d8b8000d3b3b54f3b3 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 9 Sep 2020 09:33:57 +0000 Subject: [PATCH] Fix Telegram username canonicalisation --- snscrape/modules/telegram.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 5e821a5..94546d4 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -58,9 +58,12 @@ class TelegramChannelScraper(snscrape.base.Scraper): self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml') return self._initialPage, self._initialPageSoup - def _soup_to_items(self, soup, pageUrl): + def _soup_to_items(self, soup, pageUrl, onlyUsername = False): posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True}) for post in reversed(posts): + if onlyUsername: + yield post['data-post'].split('/')[0] + return date = datetime.datetime.strptime(post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date').find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z') message = post.find('div', class_ = 'tgme_widget_message_text') if message: @@ -109,7 +112,14 @@ class TelegramChannelScraper(snscrape.base.Scraper): titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title') kwargs['title'] = titleDiv.find('span').text kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon')) - kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @ + # The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere. + # However, the post URLs are, so extract the first post and use that. + try: + kwargs['username'] = next(self._soup_to_items(soup, r.url, onlyUsername = True)) + except StopIteration: + # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry. + logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly') + kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @ descriptionDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_description') if descriptionDiv: kwargs['description'] = descriptionDiv.text