From 3817aa59d404cbddbe88ffacde5b818257726ea0 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 16 May 2019 16:42:52 +0000 Subject: [PATCH] Add support for extracting links from tweets (including cards) Both the t.co and the original URLs can be extracted. Note that card links are always t.co since Twitter's HTML does not include the original URL for those. --- snscrape/modules/twitter.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 6e97261..33e814d 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -14,6 +14,10 @@ class Tweet(typing.NamedTuple, snscrape.base.Item): url: str date: datetime.datetime content: str + outlinks: list + outlinksss: str + tcooutlinks: list + tcooutlinksss: str def __str__(self): return self.url @@ -37,8 +41,23 @@ class TwitterSearchScraper(snscrape.base.Scraper): username = tweet.find('span', 'username').find('b').text tweetID = tweet['data-item-id'] date = datetime.datetime.fromtimestamp(int(tweet.find('a', 'tweet-timestamp').find('span', '_timestamp')['data-time']), datetime.timezone.utc) - content = tweet.find('p', 'tweet-text').text - yield Tweet(f'https://twitter.com/{username}/status/{tweetID}', date, content) + contentP = tweet.find('p', 'tweet-text') + content = contentP.text + outlinks = [] + tcooutlinks = [] + for a in contentP.find_all('a'): + if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']): + outlinks.append(a['data-expanded-url']) + tcooutlinks.append(a['href']) + card = tweet.find('div', 'card2') + if card and 'has-autoplayable-media' not in card['class']: + for div in card.find_all('div'): + if div.has_attr('data-card-url'): + outlinks.append(div['data-card-url']) + tcooutlinks.append(div['data-card-url']) + outlinks = list(dict.fromkeys(outlinks)) # Deduplicate in case the same link was shared more than once within this tweet; may change order on Python 3.6 or older + tcooutlinks = list(dict.fromkeys(tcooutlinks)) + yield Tweet(f'https://twitter.com/{username}/status/{tweetID}', date, content, outlinks, ' '.join(outlinks), tcooutlinks, ' '.join(tcooutlinks)) def _check_json_callback(self, r): if r.headers.get('content-type') != 'application/json;charset=utf-8':