From 3817aa59d404cbddbe88ffacde5b818257726ea0 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 16 May 2019 16:42:52 +0000
Subject: [PATCH] Add support for extracting links from tweets (including
 cards)

Both the t.co and the original (expanded) URLs can be extracted for links in the tweet text. Note that card links are always reported as t.co URLs in both lists, since Twitter's HTML does not include the original URL for those.
---
 snscrape/modules/twitter.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py
index 6e97261..33e814d 100644
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -14,6 +14,10 @@ class Tweet(typing.NamedTuple, snscrape.base.Item):
 	url: str
 	date: datetime.datetime
 	content: str
+	outlinks: list
+	outlinksss: str
+	tcooutlinks: list
+	tcooutlinksss: str
 
 	def __str__(self):
 		return self.url
@@ -37,8 +41,23 @@ class TwitterSearchScraper(snscrape.base.Scraper):
 			username = tweet.find('span', 'username').find('b').text
 			tweetID = tweet['data-item-id']
 			date = datetime.datetime.fromtimestamp(int(tweet.find('a', 'tweet-timestamp').find('span', '_timestamp')['data-time']), datetime.timezone.utc)
-			content = tweet.find('p', 'tweet-text').text
-			yield Tweet(f'https://twitter.com/{username}/status/{tweetID}', date, content)
+			contentP = tweet.find('p', 'tweet-text')
+			content = contentP.text
+			outlinks = []
+			tcooutlinks = []
+			for a in contentP.find_all('a'):
+				if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']):
+					outlinks.append(a['data-expanded-url'])
+					tcooutlinks.append(a['href'])
+			card = tweet.find('div', 'card2')
+			if card and 'has-autoplayable-media' not in card['class']:
+				for div in card.find_all('div'):
+					if div.has_attr('data-card-url'):
+						outlinks.append(div['data-card-url'])
+						tcooutlinks.append(div['data-card-url'])
+			outlinks = list(dict.fromkeys(outlinks)) # Deduplicate in case the same link was shared more than once within this tweet; may change order on Python 3.6 or older
+			tcooutlinks = list(dict.fromkeys(tcooutlinks))
+			yield Tweet(f'https://twitter.com/{username}/status/{tweetID}', date, content, outlinks, ' '.join(outlinks), tcooutlinks, ' '.join(tcooutlinks))
 
 	def _check_json_callback(self, r):
 		if r.headers.get('content-type') != 'application/json;charset=utf-8':