mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 12:28:28 +03:00
Handle tweets by temporarily blocked accounts (which show up in the search results but don't have a date or content)
This commit is contained in:
@@ -48,18 +48,31 @@ class TwitterSearchScraper(snscrape.base.Scraper):
|
|||||||
username = tweet.find('span', 'username').find('b').text
|
username = tweet.find('span', 'username').find('b').text
|
||||||
tweetID = tweet['data-item-id']
|
tweetID = tweet['data-item-id']
|
||||||
url = f'https://twitter.com/{username}/status/{tweetID}'
|
url = f'https://twitter.com/{username}/status/{tweetID}'
|
||||||
date = datetime.datetime.fromtimestamp(int(tweet.find('a', 'tweet-timestamp').find('span', '_timestamp')['data-time']), datetime.timezone.utc)
|
|
||||||
|
date = None
|
||||||
|
timestampA = tweet.find('a', 'tweet-timestamp')
|
||||||
|
if timestampA:
|
||||||
|
timestampSpan = timestampA.find('span', '_timestamp')
|
||||||
|
if timestampSpan and timestampSpan.has_attr('data-time'):
|
||||||
|
date = datetime.datetime.fromtimestamp(int(timestampSpan['data-time']), datetime.timezone.utc)
|
||||||
|
if not date:
|
||||||
|
logger.warning(f'Failed to extract date for {url}')
|
||||||
|
|
||||||
contentP = tweet.find('p', 'tweet-text')
|
contentP = tweet.find('p', 'tweet-text')
|
||||||
content = contentP.text
|
content = None
|
||||||
outlinks = []
|
outlinks = []
|
||||||
tcooutlinks = []
|
tcooutlinks = []
|
||||||
for a in contentP.find_all('a'):
|
if contentP:
|
||||||
if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']):
|
content = contentP.text
|
||||||
if a.has_attr('data-expanded-url'):
|
for a in contentP.find_all('a'):
|
||||||
outlinks.append(a['data-expanded-url'])
|
if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']):
|
||||||
else:
|
if a.has_attr('data-expanded-url'):
|
||||||
logger.warning(f'Ignoring link without expanded URL on {url}: {a["href"]}')
|
outlinks.append(a['data-expanded-url'])
|
||||||
tcooutlinks.append(a['href'])
|
else:
|
||||||
|
logger.warning(f'Ignoring link without expanded URL on {url}: {a["href"]}')
|
||||||
|
tcooutlinks.append(a['href'])
|
||||||
|
else:
|
||||||
|
logger.warning(f'Failed to extract content for {url}')
|
||||||
card = tweet.find('div', 'card2')
|
card = tweet.find('div', 'card2')
|
||||||
if card and 'has-autoplayable-media' not in card['class']:
|
if card and 'has-autoplayable-media' not in card['class']:
|
||||||
for div in card.find_all('div'):
|
for div in card.find_all('div'):
|
||||||
|
|||||||
Reference in New Issue
Block a user