mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Added `mentions` and `hashtags` attributes for user mentions and hashtags; removed redundant entries (mention links, hashtag links, and the forwarded URL) from `outlinks`.
This commit is contained in:
@@ -50,7 +50,9 @@ class TelegramPost(snscrape.base.Item):
|
|||||||
url: str
|
url: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
content: str
|
content: str
|
||||||
outlinks: list
|
outlinks: typing.List[str] = None
|
||||||
|
mentions: typing.List[str] = None
|
||||||
|
hashtags: typing.List[str] = None
|
||||||
forwarded: typing.Optional['Channel'] = None
|
forwarded: typing.Optional['Channel'] = None
|
||||||
forwardedUrl: typing.Optional[str] = None
|
forwardedUrl: typing.Optional[str] = None
|
||||||
media: typing.Optional[typing.List['Medium']] = None
|
media: typing.Optional[typing.List['Medium']] = None
|
||||||
@@ -133,6 +135,8 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
content = None
|
content = None
|
||||||
|
|
||||||
outlinks = []
|
outlinks = []
|
||||||
|
mentions = []
|
||||||
|
hashtags = []
|
||||||
for link in post.find_all('a'):
|
for link in post.find_all('a'):
|
||||||
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
|
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
|
||||||
# Author links at the top (avatar and name)
|
# Author links at the top (avatar and name)
|
||||||
@@ -154,8 +158,14 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
# encoded_string = base64.b64encode(resp.content)
|
# encoded_string = base64.b64encode(resp.content)
|
||||||
# Individual photo or video link
|
# Individual photo or video link
|
||||||
continue
|
continue
|
||||||
|
if link.text.startswith('@'):
|
||||||
|
mentions.append(link.text.strip('@'))
|
||||||
|
continue
|
||||||
|
if link.text.startswith('#'):
|
||||||
|
hashtags.append(link.text.strip('#'))
|
||||||
|
continue
|
||||||
href = urllib.parse.urljoin(pageUrl, link['href'])
|
href = urllib.parse.urljoin(pageUrl, link['href'])
|
||||||
if (href not in outlinks) and (href != rawUrl):
|
if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
|
||||||
outlinks.append(href)
|
outlinks.append(href)
|
||||||
|
|
||||||
for voice_player in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
for voice_player in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
||||||
@@ -217,7 +227,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
else:
|
else:
|
||||||
views = parse_num(viewsSpan.text)
|
views = parse_num(viewsSpan.text)
|
||||||
|
|
||||||
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
||||||
|
|
||||||
def get_items(self):
|
def get_items(self):
|
||||||
r, soup = self._initial_page()
|
r, soup = self._initial_page()
|
||||||
|
|||||||
Reference in New Issue
Block a user