mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Handle Telegram channels without public posts
This commit is contained in:
@@ -25,8 +25,8 @@ class Channel(typing.NamedTuple, snscrape.base.Entity):
|
|||||||
title: str
|
title: str
|
||||||
verified: bool
|
verified: bool
|
||||||
photo: str
|
photo: str
|
||||||
members: int
|
|
||||||
description: typing.Optional[str] = None
|
description: typing.Optional[str] = None
|
||||||
|
members: typing.Optional[int] = None
|
||||||
photos: typing.Optional[int] = None
|
photos: typing.Optional[int] = None
|
||||||
photosGranularity: typing.Optional[snscrape.base.Granularity] = None
|
photosGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||||
videos: typing.Optional[int] = None
|
videos: typing.Optional[int] = None
|
||||||
@@ -75,6 +75,9 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
|
|
||||||
def get_items(self):
|
def get_items(self):
|
||||||
r, soup = self._initial_page()
|
r, soup = self._initial_page()
|
||||||
|
if '/s/' not in r.url:
|
||||||
|
logger.warning('No public post list for this user')
|
||||||
|
return
|
||||||
while True:
|
while True:
|
||||||
yield from self._soup_to_items(soup, r.url)
|
yield from self._soup_to_items(soup, r.url)
|
||||||
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
|
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
|
||||||
@@ -94,11 +97,13 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
|
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
|
||||||
assert membersDiv.text.endswith(' members')
|
if membersDiv.text.endswith(' members'):
|
||||||
kwargs['members'] = int(membersDiv.text[:-8].replace(' ', ''))
|
kwargs['members'] = int(membersDiv.text[:-8].replace(' ', ''))
|
||||||
kwargs['photo'] = soup.find('img', class_ = 'tgme_page_photo_image').attrs['src']
|
kwargs['photo'] = soup.find('img', class_ = 'tgme_page_photo_image').attrs['src']
|
||||||
|
|
||||||
r, soup = self._initial_page()
|
r, soup = self._initial_page()
|
||||||
|
if '/s/' not in r.url: # Redirect on channels without public posts
|
||||||
|
return
|
||||||
channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
|
channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
|
||||||
assert channelInfoDiv, 'channel info div not found'
|
assert channelInfoDiv, 'channel info div not found'
|
||||||
titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
|
titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
|
||||||
|
|||||||
Reference in New Issue
Block a user