From 3fb731ade16dacc8c83d2b5efa28bfca80c764c7 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 16 Sep 2021 08:06:05 -0400 Subject: [PATCH 1/5] User Labels In August of 2020 Twitter started to label the accounts of government officials and state-affiliated media entities: https://blog.twitter.com/en_us/topics/product/2020/new-labels-for-government-and-state-affiliated-media-accounts This information is extremely important for researchers who are studying the impact of social media on political discourse, especially because it is not currently available through either Twitter's v1.1 or v2 API endpoints. The code in this small PR may seem a bit brittle but I've been using it to collect data with each of the twitter subcommands and it seems to work reliably. While there are image and page URLs associated with each label I chose to only collect the text description of the label since it should be sufficient for finding the additional information later if needed. --- snscrape/modules/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 9b0a598..b6da951 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -136,6 +136,7 @@ class User(snscrape.base.Entity): linkTcourl: typing.Optional[str] = None profileImageUrl: typing.Optional[str] = None profileBannerUrl: typing.Optional[str] = None + label: typing.Optional[str] = None @property def url(self): @@ -457,6 +458,9 @@ class TwitterAPIScraper(snscrape.base.Scraper): kwargs['linkTcourl'] = user.get('url') kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') + if 'label' in user['ext']['highlightedLabel']['r']['ok']: + kwargs['label'] = user['ext']['highlightedLabel']['r']['ok']['label']['description'] + return User(**kwargs) From a11eef6b06e03177aac3371a81fd8b587e760f5e Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 16 Sep 2021 13:04:57 -0400 Subject: 
[PATCH 2/5] User label url MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each label also has a URL which is used for learning more about the label. While there are more label descriptions than label URLs the URLs do seem to group language variants of the same label. For example https://help.twitter.com/rules-and-policies/state-affiliated-china is used for all of the following label descriptions: * Média affilié à un État, Chine * China state-affiliated media * 中国官方媒体 * Çin devletine bağlı medya * China government official In some analysis contexts it could be useful to group these together. --- snscrape/modules/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index b6da951..ea0e0fa 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -137,6 +137,7 @@ class User(snscrape.base.Entity): profileImageUrl: typing.Optional[str] = None profileBannerUrl: typing.Optional[str] = None label: typing.Optional[str] = None + labelUrl: typing.Optional[str] = None @property def url(self): @@ -460,6 +461,7 @@ class TwitterAPIScraper(snscrape.base.Scraper): kwargs['profileBannerUrl'] = user.get('profile_banner_url') if 'label' in user['ext']['highlightedLabel']['r']['ok']: kwargs['label'] = user['ext']['highlightedLabel']['r']['ok']['label']['description'] + kwargs['labelUrl'] = user['ext']['highlightedLabel']['r']['ok']['label']['url']['url'] return User(**kwargs) From 9831f2a4a021d493cac548d2aaf5b99a5d3c1fbc Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 16 Sep 2021 13:31:47 -0400 Subject: [PATCH 3/5] missing ext While doing some long term data collection I found some user objects that lack the key 'ext'. This would cause an exception unless it's checked for before trying to dig out results. 
--- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index ea0e0fa..2c00baa 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -459,7 +459,7 @@ class TwitterAPIScraper(snscrape.base.Scraper): kwargs['linkTcourl'] = user.get('url') kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') - if 'label' in user['ext']['highlightedLabel']['r']['ok']: + if 'ext' in user and 'label' in user['ext']['highlightedLabel']['r']['ok']: kwargs['label'] = user['ext']['highlightedLabel']['r']['ok']['label']['description'] kwargs['labelUrl'] = user['ext']['highlightedLabel']['r']['ok']['label']['url']['url'] From 2825bd0a7300777fc01d71aeb5f5a422987c6d73 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Sep 2021 03:31:56 +0000 Subject: [PATCH 4/5] Remove accidental empty line --- snscrape/modules/twitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 2c00baa..43a3159 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -462,7 +462,6 @@ class TwitterAPIScraper(snscrape.base.Scraper): if 'ext' in user and 'label' in user['ext']['highlightedLabel']['r']['ok']: kwargs['label'] = user['ext']['highlightedLabel']['r']['ok']['label']['description'] kwargs['labelUrl'] = user['ext']['highlightedLabel']['r']['ok']['label']['url']['url'] - return User(**kwargs) From 5fc2562642c8bd6ef7e889f583560ec576e373ce Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Sep 2021 03:32:35 +0000 Subject: [PATCH 5/5] Add user label support on entity retrieval --- snscrape/modules/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 43a3159..c27e17e 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ 
-584,6 +584,8 @@ class TwitterUserScraper(TwitterSearchScraper): linkTcourl = user['legacy'].get('url'), profileImageUrl = user['legacy']['profile_image_url_https'], profileBannerUrl = user['legacy'].get('profile_banner_url'), + label = user['affiliates_highlighted_label']['label']['description'] if user['affiliates_highlighted_label'] else None, + labelUrl = user['affiliates_highlighted_label']['label']['url']['url'] if user['affiliates_highlighted_label'] else None, ) def get_items(self):