From 3ab69a1a0fcc4e5abe1afd07a17993cc23f5100c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 2 Mar 2018 18:55:36 +0100 Subject: [PATCH] Merge Twitter user and hashtag into one, and add support for generic Twitter search scrapes --- .../{twitter_user_tweets.py => twitter.py} | 46 +++++++++-- .../modules/twitter_hashtag_tweets.py | 78 ------------------- 2 files changed, 38 insertions(+), 86 deletions(-) rename socialmediascraper/modules/{twitter_user_tweets.py => twitter.py} (66%) delete mode 100644 socialmediascraper/modules/twitter_hashtag_tweets.py diff --git a/socialmediascraper/modules/twitter_user_tweets.py b/socialmediascraper/modules/twitter.py similarity index 66% rename from socialmediascraper/modules/twitter_user_tweets.py rename to socialmediascraper/modules/twitter.py index a37f270..bbb34bb 100644 --- a/socialmediascraper/modules/twitter_user_tweets.py +++ b/socialmediascraper/modules/twitter.py @@ -7,12 +7,12 @@ import socialmediascraper.base logger = logging.getLogger(__name__) -class TwitterUserTweetsScraper(socialmediascraper.base.Scraper): - name = 'twitter-user-tweets' +class TwitterSearchScraper(socialmediascraper.base.Scraper): + name = 'twitter-search' - def __init__(self, username, **kwargs): + def __init__(self, query, **kwargs): super().__init__(**kwargs) - self._username = username + self._query = query def _get_feed_from_html(self, html): soup = bs4.BeautifulSoup(html, 'lxml') @@ -32,12 +32,11 @@ class TwitterUserTweetsScraper(socialmediascraper.base.Scraper): return True def get_items(self): - query = f'from:{self._username}' headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} # First page - logger.info(f'Retrieving search page for {query}') - r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': query, 'src': 'typd'}, headers = headers) + logger.info(f'Retrieving search page for {self._query}') + r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'typd'}, headers = headers) feed = self._get_feed_from_html(r.text) if not feed: @@ -53,7 +52,7 @@ class TwitterUserTweetsScraper(socialmediascraper.base.Scraper): 'f': 'tweets', 'vertical': 'default', 'lang': 'en', - 'q': query, + 'q': self._query, 'include_available_features': '1', 'include_entities': '1', 'reset_error_state': 'false', @@ -69,6 +68,22 @@ class TwitterUserTweetsScraper(socialmediascraper.base.Scraper): maxPosition = f'TWEET-{feed[-1]["data-item-id"]}-{newestID}' yield from self._feed_to_items(feed) + @classmethod + def setup_parser(cls, subparser): + subparser.add_argument('query', help = 'A Twitter search string') + + @classmethod + def from_args(cls, args): + return cls(args.query, retries = args.retries) + + +class TwitterUserScraper(TwitterSearchScraper): + name = 'twitter-user' + + def __init__(self, username, **kwargs): + super().__init__(f'from:{username}', **kwargs) + self._username = username + @classmethod def setup_parser(cls, subparser): subparser.add_argument('username', help = 'A Twitter username (without @)') @@ -76,3 +91,18 @@ class TwitterUserTweetsScraper(socialmediascraper.base.Scraper): @classmethod def from_args(cls, args): return cls(args.username, retries = args.retries) + +class TwitterHashtagScraper(TwitterSearchScraper): + name = 'twitter-hashtag' + + def __init__(self, hashtag, **kwargs): + super().__init__(f'#{hashtag}', **kwargs) + self._hashtag = hashtag + + @classmethod + def setup_parser(cls, subparser): + subparser.add_argument('hashtag', help = 'A Twitter hashtag (without #)') + + @classmethod + def from_args(cls, args): + return cls(args.hashtag, retries = args.retries) diff --git a/socialmediascraper/modules/twitter_hashtag_tweets.py b/socialmediascraper/modules/twitter_hashtag_tweets.py deleted file mode 100644 index 9a8f7fe..0000000 --- a/socialmediascraper/modules/twitter_hashtag_tweets.py +++ /dev/null @@ -1,78 +0,0 @@ -import bs4 -import json -import logging -import socialmediascraper.base - - -logger = logging.getLogger(__name__) - - -class TwitterHashtagTweetsScraper(socialmediascraper.base.Scraper): - name = 'twitter-hashtag-tweets' - - def __init__(self, hashtag, **kwargs): - super().__init__(**kwargs) - self._hashtag = hashtag - - def _get_feed_from_html(self, html): - soup = bs4.BeautifulSoup(html, 'lxml') - feed = soup.find_all('li', 'js-stream-item') - return feed - - def _feed_to_items(self, feed): - for tweet in feed: - username = tweet.find('span', 'username').find('b').text - tweetID = tweet['data-item-id'] - yield socialmediascraper.base.URLItem(f'https://twitter.com/{username}/status/{tweetID}') - - def _check_json_callback(self, r): - if r.headers['content-type'] != 'application/json;charset=utf-8': - logger.error(f'Content type of {r.url} is not JSON') - return False - return True - - def get_items(self): - query = f'#{self._hashtag}' - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} - - # First page - logger.info(f'Retrieving search page for {query}') - r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': query, 'src': 'typd'}, headers = headers) - - feed = self._get_feed_from_html(r.text) - if not feed: - return - newestID = feed[0]['data-item-id'] - maxPosition = f'TWEET-{feed[-1]["data-item-id"]}-{newestID}' - yield from self._feed_to_items(feed) - - while True: - logger.info(f'Retrieving scroll page {maxPosition}') - r = self._get('https://twitter.com/i/search/timeline', - params = { - 'f': 'tweets', - 'vertical': 'default', - 'lang': 'en', - 'q': query, - 'include_available_features': '1', - 'include_entities': '1', - 'reset_error_state': 'false', - 'src': 'typd', - 'max_position': maxPosition, - }, - headers = headers, - responseOkCallback = self._check_json_callback) - - feed = self._get_feed_from_html(json.loads(r.text)['items_html']) - if not feed: - return - maxPosition = f'TWEET-{feed[-1]["data-item-id"]}-{newestID}' - yield from self._feed_to_items(feed) - - @classmethod - def setup_parser(cls, subparser): - subparser.add_argument('hashtag', help = 'A Twitter hashtag (without #)') - - @classmethod - def from_args(cls, args): - return cls(args.hashtag, retries = args.retries)