From d65f0434daec9168355f1c25909b326f3a83ea8a Mon Sep 17 00:00:00 2001 From: sahrul Date: Mon, 26 Oct 2020 16:46:10 +0700 Subject: [PATCH 1/2] split source into url and label --- snscrape/modules/twitter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index d6ac2f1..b77388e 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -34,7 +34,8 @@ class Tweet(snscrape.base.Item): quoteCount: int conversationId: int lang: str - source: str + sourceUrl: str + sourceLabel: str media: typing.Optional[typing.List['Medium']] = None retweetedTweet: typing.Optional['Tweet'] = None quotedTweet: typing.Optional['Tweet'] = None @@ -314,7 +315,10 @@ class TwitterAPIScraper(snscrape.base.Scraper): kwargs['quoteCount'] = tweet['quote_count'] kwargs['conversationId'] = tweet['conversation_id'] if 'conversation_id' in tweet else int(tweet['conversation_id_str']) kwargs['lang'] = tweet['lang'] - kwargs['source'] = tweet['source'] + if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): + kwargs['sourceUrl'] = match.group(1) + if (match := re.search(r'>([^<]*)<', tweet['source'])): + kwargs['sourceLabel'] = match.group(1) if 'extended_entities' in tweet and 'media' in tweet['extended_entities']: media = [] for medium in tweet['extended_entities']['media']: From d2dce37fa0206b1c08dccf77bdc51897a2133e11 Mon Sep 17 00:00:00 2001 From: sahrul Date: Tue, 27 Oct 2020 13:21:21 +0700 Subject: [PATCH 2/2] add the original tweet source --- snscrape/modules/twitter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index b77388e..536bb58 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -34,8 +34,9 @@ class Tweet(snscrape.base.Item): quoteCount: int conversationId: int lang: str - sourceUrl: str - sourceLabel: str + source: str + sourceUrl: typing.Optional[str] = None + sourceLabel: typing.Optional[str] = None media: typing.Optional[typing.List['Medium']] = None retweetedTweet: typing.Optional['Tweet'] = None quotedTweet: typing.Optional['Tweet'] = None @@ -315,6 +316,7 @@ class TwitterAPIScraper(snscrape.base.Scraper): kwargs['quoteCount'] = tweet['quote_count'] kwargs['conversationId'] = tweet['conversation_id'] if 'conversation_id' in tweet else int(tweet['conversation_id_str']) kwargs['lang'] = tweet['lang'] + kwargs['source'] = tweet['source'] if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): kwargs['sourceUrl'] = match.group(1) if (match := re.search(r'>([^<]*)<', tweet['source'])):