From 7989af27b5bbb6b9e1eb7a2fa61d90bd0b148f3a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 May 2019 22:37:43 +0000 Subject: [PATCH] Handle tweets by temporarily blocked accounts (which show up in the search results but don't have a date or content) --- snscrape/modules/twitter.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index edacf46..fc382f2 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -48,18 +48,31 @@ class TwitterSearchScraper(snscrape.base.Scraper): username = tweet.find('span', 'username').find('b').text tweetID = tweet['data-item-id'] url = f'https://twitter.com/{username}/status/{tweetID}' - date = datetime.datetime.fromtimestamp(int(tweet.find('a', 'tweet-timestamp').find('span', '_timestamp')['data-time']), datetime.timezone.utc) + + date = None + timestampA = tweet.find('a', 'tweet-timestamp') + if timestampA: + timestampSpan = timestampA.find('span', '_timestamp') + if timestampSpan and timestampSpan.has_attr('data-time'): + date = datetime.datetime.fromtimestamp(int(timestampSpan['data-time']), datetime.timezone.utc) + if not date: + logger.warning(f'Failed to extract date for {url}') + contentP = tweet.find('p', 'tweet-text') - content = contentP.text + content = None outlinks = [] tcooutlinks = [] - for a in contentP.find_all('a'): - if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']): - if a.has_attr('data-expanded-url'): - outlinks.append(a['data-expanded-url']) - else: - logger.warning(f'Ignoring link without expanded URL on {url}: {a["href"]}') - tcooutlinks.append(a['href']) + if contentP: + content = contentP.text + for a in contentP.find_all('a'): + if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']): + if a.has_attr('data-expanded-url'): + outlinks.append(a['data-expanded-url']) + else: + logger.warning(f'Ignoring link without expanded URL on {url}: {a["href"]}') + tcooutlinks.append(a['href']) + else: + logger.warning(f'Failed to extract content for {url}') card = tweet.find('div', 'card2') if card and 'has-autoplayable-media' not in card['class']: for div in card.find_all('div'):