mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 02:28:29 +03:00
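Fixed an issue where the Telegram scraper terminated early because some pages didn't have a next-page link (added a reasonable default: when the link is missing, fall back to the previous page URL with its post index reduced by 20, stopping once that index is 20 or less)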
This commit is contained in:
@@ -214,8 +214,13 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
yield from self._soup_to_items(soup, r.url)
|
yield from self._soup_to_items(soup, r.url)
|
||||||
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
|
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
|
||||||
if not pageLink:
|
if not pageLink:
|
||||||
break
|
nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
|
||||||
|
if nextPostIndex > 20:
|
||||||
|
pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
|
||||||
|
else:
|
||||||
|
break
|
||||||
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
||||||
|
print(f'nextPageUrl: {nextPageUrl}')
|
||||||
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
|
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
|
|||||||
Reference in New Issue
Block a user