mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 11:58:28 +03:00
Add support for extracting the entity behind a scrape
Closes #11. Backwards incompatibility: snscrape.modules.twitter.Account is now called User. However, this was previously only used by the list member scraper, which has been broken for a while since the list member list is no longer publicly accessible. For compatibility reasons, the CLI does not output the entity by default; the new option --with-entity enables it.
This commit is contained in:
@@ -17,6 +17,22 @@ class Item:
|
||||
pass
|
||||
|
||||
|
||||
class Entity:
    '''Base class for the objects returned by a scraper's get_entity method.

    An entity is typically the account of a person or organisation. Converting an entity to a string should give the preferred direct URL to the entity's page on the network.'''

    @abc.abstractmethod
    def __str__(self):
        # Placeholder only: concrete entity classes provide the real URL string.
        return None
|
||||
|
||||
|
||||
Granularity = int
|
||||
'''Type of fields storing the unit/granularity of numbers.
|
||||
|
||||
For example, a granularity of 1000 means that the SNS returned something like '42k' and the last three significant digits are unknown.'''
|
||||
|
||||
|
||||
class URLItem(Item):
|
||||
'''A generic item which only holds a URL string.'''
|
||||
|
||||
@@ -49,6 +65,10 @@ class Scraper:
|
||||
'''Iterator yielding Items.'''
|
||||
pass
|
||||
|
||||
def get_entity(self):
    '''Return the entity behind this scraper.

    This default implementation has no entity to offer and therefore always returns None.'''
    entity = None
    return entity
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
|
||||
@@ -170,6 +170,7 @@ def parse_args():
|
||||
group = parser.add_mutually_exclusive_group(required = False)
|
||||
group.add_argument('-f', '--format', dest = 'format', type = str, default = None, help = 'Output format')
|
||||
group.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
|
||||
parser.add_argument('--with-entity', dest = 'withEntity', action = 'store_true', default = False, help = 'Include the entity (e.g. user, channel) as the first output item')
|
||||
parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
|
||||
|
||||
subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
|
||||
@@ -236,6 +237,13 @@ def main():
|
||||
|
||||
i = 0
|
||||
with _dump_locals_on_exception():
|
||||
if args.withEntity:
|
||||
entity = scraper.get_entity()
|
||||
if entity:
|
||||
if args.jsonl:
|
||||
print(json.dumps(entity._asdict(), default = json_serialise_datetime))
|
||||
else:
|
||||
print(entity)
|
||||
for i, item in enumerate(scraper.get_items(), start = 1):
|
||||
if args.since is not None and item.date < args.since:
|
||||
logger.info(f'Exiting due to reaching older results than {args.since}')
|
||||
|
||||
@@ -23,6 +23,25 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
    '''Entity describing a Facebook user/page.

    The first four fields are mandatory; every other field defaults to None.'''

    # Mandatory fields
    username: str
    pageId: int
    name: str
    verified: bool

    # Optional fields
    created: typing.Optional[datetime.date] = None
    pageOwner: typing.Optional[str] = None
    likes: typing.Optional[int] = None
    followers: typing.Optional[int] = None
    checkins: typing.Optional[int] = None
    address: typing.Optional[str] = None
    phone: typing.Optional[str] = None
    web: typing.Optional[str] = None
    keywords: typing.Optional[typing.List[str]] = None

    def __str__(self):
        '''Return the URL of the page on facebook.com.'''
        return f'https://www.facebook.com/{self.username}/'
|
||||
|
||||
|
||||
class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
def _clean_url(self, dirtyUrl):
|
||||
u = urllib.parse.urlparse(dirtyUrl)
|
||||
@@ -136,6 +155,19 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
|
||||
def __init__(self, username, **kwargs):
    '''Store the username, the request headers and an empty initial-page cache.

    Remaining keyword arguments are forwarded to the parent initialiser.'''
    super().__init__(**kwargs)
    self._username = username
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Nothing has been fetched yet.
    self._initialPage = self._initialPageSoup = None
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException('Got status code {r.status_code}')
|
||||
self._initialPage = r
|
||||
self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
@@ -143,14 +175,10 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
|
||||
nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = headers)
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
raise snscrape.base.ScraperException('Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
|
||||
@@ -190,6 +218,76 @@ class FacebookUserScraper(FacebookUserAndCommunityScraper):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
|
||||
def get_entity(self):
    '''Build a User entity from the cached initial page.

    Returns None when the initial page did not answer with status 200, otherwise a User built from the values collected in kwargs. The extraction relies on specific CSS class names, icon URLs and English text fragments of the page markup.

    NOTE(review): the results of several search()/find() calls (handleDiv, handle, nameVerifiedMarkup, pageTransparencyContentDiv, communityDiv, androidUrlMeta) are used without a None check, so markup that lacks them would raise here — confirm whether that is intended.'''
    kwargs = {}

    # Patterns applied to the raw response text and to the 'Page created' text.
    nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
    handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
    handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')

    r, soup = self._initial_page()
    if r.status_code != 200:
        return

    # Username: first path component of the link inside the "tab_home" div.
    handleDiv = handleDivPattern.search(r.text)
    handle = handlePattern.search(handleDiv.group(0))
    kwargs['username'] = handle.group(1)

    # Name and verification marker come from an HTML fragment embedded as a JSON string.
    nameVerifiedMarkup = nameVerifiedMarkupPattern.search(r.text)
    nameVerifiedMarkup = json.loads(nameVerifiedMarkup.group(1))
    nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedMarkup, 'lxml')
    kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
    kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))

    # Only the first div with class '_61-0' is inspected (find returns the first match).
    pageTransparencyContentDiv = soup.find('div', class_ = '_61-0')
    if pageTransparencyContentDiv.text.startswith('Page created - '):
        createdDateMess = pageTransparencyContentDiv.text.split(' - ', 1)[1]
        m = createdDatePattern.match(createdDateMess)
        assert m, 'unexpected created div content'
        # Groups: 1 = month name, 2 = day, 3 = year
        kwargs['created'] = datetime.date(int(m.group(3)), months.index(m.group(1)) + 1, int(m.group(2)))
    if pageTransparencyContentDiv.text.startswith('Confirmed Page Owner: '):
        kwargs['pageOwner'] = pageTransparencyContentDiv.text.split(': ', 1)[1]

    # Counters: the leading number (thousands separators removed) of each matching line.
    communityDiv = soup.find('div', class_ = '_6590')
    for div in communityDiv.find_all('div', class_ = '_4bl9'):
        text = div.text
        if text.endswith(' people like this'):
            kwargs['likes'] = int(text.split(' ', 1)[0].replace(',', ''))
        elif text.endswith(' people follow this'):
            kwargs['followers'] = int(text.split(' ', 1)[0].replace(',', ''))
        elif text.endswith(' check-ins'):
            kwargs['checkins'] = int(text.split(' ', 1)[0].replace(',', ''))

    aboutDiv = soup.find('div', class_ = '_u9q')
    if aboutDiv:
        # As if the above wasn't already ugly enough, this is where it gets really bad...
        # Each row is identified by the URL of its icon image.
        for div in aboutDiv.find_all('div', class_ = '_2pi9'):
            img = div.find('img', class_ = '_3-91')
            if not img:
                continue
            if img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png': # Address
                rawAddress = div.find('div', class_ = '_2wzd').text
                kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
            elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png': # Phone number
                kwargs['phone'] = div.find('div', class_ = '_4bl9').text
            elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png': # Web link
                for a in div.find_all('a'):
                    # Skip links without text, without href, or containing a span.
                    if a.text == '' or 'href' not in a.attrs or a.find('span'):
                        continue
                    dirtyWeb = a['href']
                    assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
                    # Take the value of the first query parameter (u=...) and URL-decode it.
                    kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
            elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png': # Keywords
                kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')

    # Page ID: the part between 'fb://page/' (10 characters) and '?referrer=app_link' (18 characters).
    androidUrlMeta = soup.find('meta', property = 'al:android:url')
    assert androidUrlMeta['content'].startswith('fb://page/') and androidUrlMeta['content'].endswith('?referrer=app_link')
    kwargs['pageId'] = int(androidUrlMeta['content'][10:-18])

    return User(**kwargs)
|
||||
|
||||
|
||||
class FacebookCommunityScraper(FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-community'
|
||||
|
||||
@@ -2,6 +2,7 @@ import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
@@ -26,6 +27,20 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
    '''Entity describing an Instagram user.

    Each count is paired with a granularity field (see snscrape.base.Granularity). All fields are mandatory; name may be None.'''

    username: str
    name: typing.Optional[str]

    # Counts, each followed by its granularity
    followers: int
    followersGranularity: snscrape.base.Granularity
    following: int
    followingGranularity: snscrape.base.Granularity
    posts: int
    postsGranularity: snscrape.base.Granularity

    def __str__(self):
        '''Return the URL of the profile on instagram.com.'''
        return f'https://www.instagram.com/{self.username}/'
|
||||
|
||||
|
||||
class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, mode, name, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
@@ -34,6 +49,8 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
self._mode = mode
|
||||
self._name = name
|
||||
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
if self._mode == 'User':
|
||||
self._initialUrl = f'https://www.instagram.com/{self._name}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
@@ -58,6 +75,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
self._initialPage = None
|
||||
|
||||
def _response_to_items(self, response):
|
||||
for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
@@ -79,6 +97,17 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
isVideo = node['node']['is_video'],
|
||||
)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
self._initialPage = r
|
||||
return self._initialPage
|
||||
|
||||
def _check_initial_page_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return True, None
|
||||
@@ -101,17 +130,10 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return True, None
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = headers, responseOkCallback = self._check_initial_page_callback)
|
||||
r = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning(f'{self._mode} does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
response = r._snscrape_json_obj
|
||||
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
||||
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
||||
@@ -126,6 +148,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return
|
||||
endCursor = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
headers = self._headers.copy()
|
||||
while True:
|
||||
logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
variables = self._variablesFormat.format(**locals())
|
||||
@@ -156,6 +179,42 @@ class InstagramUserScraper(InstagramCommonScraper):
|
||||
def from_args(cls, args):
|
||||
return cls('User', args.username, retries = args.retries)
|
||||
|
||||
def get_entity(self):
    '''Build a User entity from the og:description meta tag of the initial page.

    Returns None if the initial page did not answer with status 200 or does not contain the meta tag. Counts abbreviated with 'k'/'m' are expanded and returned together with their granularity.'''
    r = self._initial_page()
    if r.status_code != 200:
        return
    ogDescriptionPrefix = '<meta property="og:description" content="'
    if ogDescriptionPrefix not in r.text:
        return
    ogDescriptionContentPos = r.text.index(ogDescriptionPrefix) + len(ogDescriptionPrefix)
    ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]

    numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
    # Both username alternatives accept '.', so accounts without a display name match as well.
    ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_.-]+))$')
    m = ogDescriptionPattern.match(ogDescription)
    assert m, 'unexpected og:description format'

    def parse_num(s):
        '''Turn a count such as '1,234', '12.3k' or '1.5m' into a (value, granularity) tuple.'''
        for suffix, exponent in (('m', 6), ('k', 3)):
            if s.endswith(suffix):
                mantissa = s[:-1].replace(',', '')
                decimals = len(mantissa.split('.')[1]) if '.' in mantissa else 0
                # round() rather than int(): the float product can land just below the intended integer, and truncation would then be off by one.
                return round(float(mantissa) * 10 ** exponent), 10 ** (exponent - decimals)
        return int(s.replace(',', '')), 1

    followers, followersGranularity = parse_num(m.group(1))
    following, followingGranularity = parse_num(m.group(2))
    posts, postsGranularity = parse_num(m.group(3))
    return User(
        # Group 5 is the username when a display name (group 4) is present, group 6 when it is not.
        username = m.group(5) or m.group(6),
        name = m.group(4) or None,
        followers = followers,
        followersGranularity = followersGranularity,
        following = following,
        followingGranularity = followingGranularity,
        posts = posts,
        postsGranularity = postsGranularity,
    )
|
||||
|
||||
|
||||
class InstagramHashtagScraper(InstagramCommonScraper):
|
||||
name = 'instagram-hashtag'
|
||||
|
||||
@@ -20,12 +20,43 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class Channel(typing.NamedTuple, snscrape.base.Entity):
    '''Entity describing a Telegram channel.

    The first six fields are mandatory. The media counters and their granularities (see snscrape.base.Granularity) default to None.'''

    # Mandatory fields
    username: str
    title: str
    description: str
    verified: bool
    photo: str
    members: int

    # Optional counters, each followed by its granularity
    photos: typing.Optional[int] = None
    photosGranularity: typing.Optional[snscrape.base.Granularity] = None
    videos: typing.Optional[int] = None
    videosGranularity: typing.Optional[snscrape.base.Granularity] = None
    links: typing.Optional[int] = None
    linksGranularity: typing.Optional[snscrape.base.Granularity] = None
    files: typing.Optional[int] = None
    filesGranularity: typing.Optional[snscrape.base.Granularity] = None

    def __str__(self):
        '''Return the URL of the channel's page under t.me/s/.'''
        return f'https://t.me/s/{self.username}'
|
||||
|
||||
|
||||
class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
name = 'telegram-channel'
|
||||
|
||||
def __init__(self, name, **kwargs):
    '''Store the channel name, the request headers and an empty initial-page cache.

    Remaining keyword arguments are forwarded to the parent initialiser.'''
    super().__init__(**kwargs)
    self._name = name
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    }
    # Nothing has been fetched yet.
    self._initialPage = self._initialPageSoup = None
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
r = self._get(f'https://t.me/s/{self._name}', headers = self._headers)
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml')
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def _soup_to_items(self, soup, pageUrl):
|
||||
posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
|
||||
@@ -43,19 +74,58 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
yield TelegramPost(url = f'https://t.me/s/{post["data-post"]}', date = date, content = content, outlinks = outlinks, outlinksss = outlinksss)
|
||||
|
||||
def get_items(self):
    '''Yield the posts of the channel, starting with the cached initial page and following the 'more messages' links.

    Raises snscrape.base.ScraperException if a follow-up page does not answer with status 200.'''
    r, soup = self._initial_page()
    while True:
        yield from self._soup_to_items(soup, r.url)
        pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
        if not pageLink:
            break
        nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
        # Use the instance headers; this method no longer defines a local 'headers' variable.
        r = self._get(nextPageUrl, headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
def get_entity(self):
    '''Build a Channel entity from the https://t.me/<name> page and the cached initial page.

    Raises snscrape.base.ScraperException if the https://t.me/<name> request does not answer with status 200. The counters for photos, videos, links and files are only set when the channel info lists them.'''
    kwargs = {}
    # /channel has a more accurate member count and bigger profile picture
    r = self._get(f'https://t.me/{self._name}', headers = self._headers)
    if r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    membersDiv = soup.find('div', class_ = 'tgme_page_extra')
    assert membersDiv.text.endswith(' members')
    # Strip the trailing ' members' (8 characters) and the spaces inside the number.
    kwargs['members'] = int(membersDiv.text[:-8].replace(' ', ''))
    kwargs['photo'] = soup.find('img', class_ = 'tgme_page_photo_image').attrs['src']

    r, soup = self._initial_page()
    channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
    assert channelInfoDiv, 'channel info div not found'
    titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
    kwargs['title'] = titleDiv.find('span').text
    kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon'))
    kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
    kwargs['description'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_description').text

    def parse_num(s):
        '''Turn a counter such as '1 234', '12.3K' or '1.5M' into a (value, granularity) tuple.'''
        s = s.replace(' ', '')
        for suffix, exponent in (('M', 6), ('K', 3)):
            if s.endswith(suffix):
                mantissa = s[:-1]
                decimals = len(mantissa.split('.')[1]) if '.' in mantissa else 0
                # round() rather than int(): the float product can land just below the intended integer, and truncation would then be off by one.
                return round(float(mantissa) * 10 ** exponent), 10 ** (exponent - decimals)
        return int(s), 1

    for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
        value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
        type_ = div.find('span', class_ = 'counter_type').text
        if type_ == 'members':
            # Already extracted more accurately from /channel, skip
            continue
        elif type_ in ('photos', 'videos', 'links', 'files'):
            kwargs[type_], kwargs[f'{type_}Granularity'] = value, granularity

    return Channel(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import bs4
|
||||
import datetime
|
||||
import email.utils
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
import logging
|
||||
@@ -11,6 +13,7 @@ import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||
|
||||
|
||||
class Tweet(typing.NamedTuple, snscrape.base.Item):
|
||||
@@ -28,8 +31,30 @@ class Tweet(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class Account(typing.NamedTuple, snscrape.base.Item):
|
||||
class DescriptionURL(typing.NamedTuple):
    '''A URL appearing in a description, in its different forms.'''

    text: str  # text form of the URL
    url: str  # URL
    tcourl: str  # t.co form of the URL
    indices: typing.Tuple[int, int]  # pair of integer indices
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Item, snscrape.base.Entity):
|
||||
# This is both an Item and an Entity since it can be returned as TwitterUserScraper's entity as well as TwitterListMembersScraper's items.
|
||||
# Most fields can be None if they're not known.
|
||||
|
||||
username: str
|
||||
description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced
|
||||
rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact
|
||||
descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None
|
||||
verified: typing.Optional[bool] = None
|
||||
created: typing.Optional[datetime.datetime] = None
|
||||
followersCount: typing.Optional[int] = None
|
||||
friendsCount: typing.Optional[int] = None
|
||||
statusesCount: typing.Optional[int] = None
|
||||
linkUrl: typing.Optional[str] = None
|
||||
linkTcourl: typing.Optional[str] = None
|
||||
profileImageUrl: typing.Optional[str] = None
|
||||
profileBannerUrl: typing.Optional[str] = None
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
@@ -117,7 +142,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
||||
if r.status_code == 429:
|
||||
# Accept a 429 response as "valid" to prevent retries; handled explicitly in get_items
|
||||
return True, None
|
||||
if r.headers.get('content-type') != 'application/json;charset=utf-8':
|
||||
if r.headers.get('content-type').replace(' ', '') != 'application/json;charset=utf-8':
|
||||
return False, f'content type is not JSON'
|
||||
if r.status_code != 200:
|
||||
return False, f'non-200 status code'
|
||||
@@ -126,7 +151,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
||||
def get_items(self):
|
||||
headers = {
|
||||
'User-Agent': self._userAgent,
|
||||
'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
|
||||
'Authorization': _API_AUTHORIZATION_HEADER,
|
||||
'Referer': self._baseUrl,
|
||||
}
|
||||
if self._guestToken:
|
||||
@@ -232,6 +257,55 @@ class TwitterUserScraper(TwitterSearchScraper):
|
||||
super().__init__(f'from:{username}', **kwargs)
|
||||
self._username = username
|
||||
|
||||
def get_entity(self):
    '''Fetch the profile of self._username through the UserByScreenName endpoint and return it as a User.

    A 429 response discards the guest token and retries with a fresh one. Raises snscrape.base.ScraperException on any other non-200 status or if the response body is not valid JSON.'''
    while True:
        if not self._guestToken:
            self._guestToken = self._get_guest_token(f'https://twitter.com/{self._username}')

        params = {'variables': json.dumps({'screen_name': self._username, 'withHighlightedLabel': True}, separators = (',', ':'))}
        r = self._get('https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName',
            params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote),
            headers = {'User-Agent': self._userAgent, 'Authorization': _API_AUTHORIZATION_HEADER, 'Referer': 'https://twitter.com/', 'x-guest-token': self._guestToken},
            responseOkCallback = self._check_scroll_response,
        )
        if r.status_code != 429:
            break
        # Drop the guest token and its cookie, then retry with a fresh token.
        self._guestToken = None
        del self._session.cookies['gt']
    if r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    try:
        obj = r.json()
    except json.JSONDecodeError as e:
        raise snscrape.base.ScraperException('Received invalid JSON from Twitter') from e

    user = obj['data']['user']
    legacy = user['legacy']
    rawDescription = legacy['description']
    descriptionUrls = legacy['entities']['description']['urls']
    if descriptionUrls:
        # Sort by start index BEFORE taking the text in front of the first URL; the list is not assumed to arrive in order.
        urls = sorted(descriptionUrls, key = lambda x: x['indices'][0])
        description = [rawDescription[:urls[0]['indices'][0]]]
        for url, nextUrl in itertools.zip_longest(urls, urls[1:]):
            # Replace each URL by its display form and keep the text up to the next URL (or the end).
            description.append(url['display_url'])
            description.append(rawDescription[url['indices'][1] : nextUrl['indices'][0] if nextUrl is not None else None])
        description = ''.join(description)
    else:
        description = rawDescription
    return User(
        username = legacy['screen_name'],
        description = description,
        rawDescription = rawDescription,
        descriptionUrls = [{'text': x['display_url'], 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in descriptionUrls],
        verified = legacy['verified'],
        created = email.utils.parsedate_to_datetime(legacy['created_at']),
        followersCount = legacy['followers_count'],
        friendsCount = legacy['friends_count'],
        statusesCount = legacy['statuses_count'],
        linkUrl = legacy['entities']['url']['urls'][0]['expanded_url'] if 'url' in legacy['entities'] else None,
        linkTcourl = legacy.get('url'),
        profileImageUrl = legacy['profile_image_url_https'],
        profileBannerUrl = legacy.get('profile_banner_url'),
    )
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Twitter username (without @)')
|
||||
@@ -360,7 +434,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
|
||||
logger.warning('Empty list')
|
||||
return
|
||||
for item in items:
|
||||
yield Account(username = item.find('div', 'account')['data-screen-name'])
|
||||
yield User(username = item.find('div', 'account')['data-screen-name'])
|
||||
|
||||
if not container.has_attr('data-min-position') or container['data-min-position'] == '':
|
||||
return
|
||||
@@ -375,7 +449,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
|
||||
soup = bs4.BeautifulSoup(obj['items_html'], 'lxml')
|
||||
items = soup.find_all('li', 'js-stream-item')
|
||||
for item in items:
|
||||
yield Account(username = item.find('div', 'account')['data-screen-name'])
|
||||
yield User(username = item.find('div', 'account')['data-screen-name'])
|
||||
if not obj['has_more_items']:
|
||||
break
|
||||
maxPosition = obj['min_position']
|
||||
|
||||
@@ -19,37 +19,63 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
username: str
|
||||
name: str
|
||||
verified: bool
|
||||
description: typing.Optional[str] = None
|
||||
websites: typing.Optional[typing.List[str]] = None
|
||||
followers: typing.Optional[int] = None
|
||||
followersGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||
posts: typing.Optional[int] = None
|
||||
postsGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||
photos: typing.Optional[int] = None
|
||||
photosGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||
tags: typing.Optional[int] = None
|
||||
tagsGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||
following: typing.Optional[int] = None
|
||||
followingGranularity: typing.Optional[snscrape.base.Granularity] = None
|
||||
|
||||
def __str__(self):
|
||||
return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
name = 'vkontakte-user'
|
||||
|
||||
def __init__(self, username, **kwargs):
    '''Store the username, the derived base URL, the request headers and an empty initial-page cache.

    Remaining keyword arguments are forwarded to the parent initialiser.'''
    super().__init__(**kwargs)
    self._username = username
    self._baseUrl = f'https://vk.com/{self._username}'
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Nothing has been fetched yet.
    self._initialPage = self._initialPageSoup = None
|
||||
|
||||
def _soup_to_items(self, soup, baseUrl):
|
||||
def _soup_to_items(self, soup):
|
||||
for post in soup.find_all('div', class_ = 'post'):
|
||||
dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
|
||||
textDiv = post.find('div', class_ = 'wall_post_text')
|
||||
yield VKontaktePost(
|
||||
url = urllib.parse.urljoin(baseUrl, post.find('a', class_ = 'post_link')['href']),
|
||||
url = urllib.parse.urljoin(self._baseUrl, post.find('a', class_ = 'post_link')['href']),
|
||||
date = datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc) if 'time' in dateSpan else None,
|
||||
content = textDiv.text if textDiv else None,
|
||||
)
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
baseUrl = f'https://vk.com/{self._username}'
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
||||
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
def get_items(self):
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning('Wall does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
||||
soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
||||
|
||||
if soup.find('div', class_ = 'profile_closed_wall_dummy'):
|
||||
logger.warning('Private profile')
|
||||
@@ -72,7 +98,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
else:
|
||||
fixedPostID = ''
|
||||
|
||||
yield from self._soup_to_items(soup, baseUrl)
|
||||
yield from self._soup_to_items(soup)
|
||||
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
for offset in itertools.count(start = 10, step = 10):
|
||||
@@ -92,7 +118,71 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
if not posts.startswith('<div id="post'):
|
||||
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl)
|
||||
yield from self._soup_to_items(soup)
|
||||
|
||||
def get_entity(self):
    '''Build a User describing the profile or page behind this scraper from the initial page.

    Returns None if the initial page did not come back with status code 200. Otherwise returns a User with username (last path segment of the final response URL), name and verified, plus whichever of these the page exposes: description, websites, the followers/posts/photos/tags counters and the following count. Every counter is accompanied by a '<name>Granularity' value.'''

    r, soup = self._initial_page()
    if r.status_code != 200:
        return None

    kwargs = {}
    kwargs['username'] = r.url.rsplit('/', 1)[1]
    nameH1 = soup.find('h1', class_ = 'page_name')
    kwargs['name'] = nameH1.text
    kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))

    descriptionDiv = soup.find('div', id = 'page_current_info')
    if descriptionDiv:
        kwargs['description'] = descriptionDiv.text

    websites = []
    infoDiv = soup.find('div', id = 'page_info_wrap')
    # Not every page has an info box; without one there are simply no info rows to inspect.
    rowDivs = infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']) if infoDiv else []
    for rowDiv in rowDivs:
        if 'profile_info_row' in rowDiv['class']:
            labelDiv = rowDiv.find('div', class_ = 'fl_l')
            if not labelDiv or labelDiv.text != 'Website:':
                continue
        else: # group_info_row
            # Rows without a title attribute are neither the description nor the website row.
            title = rowDiv.get('title')
            if title == 'Description':
                kwargs['description'] = rowDiv.text
            if title != 'Website':
                continue
        # href = True skips anchors that carry no link target at all.
        for a in rowDiv.find_all('a', href = True):
            href = a['href']
            if not href.startswith('/away.php?to='):
                logger.warning(f'Skipping odd website link: {href!r}')
                continue
            # The real target is the percent-encoded value of the 'to' parameter of the redirect link.
            websites.append(urllib.parse.unquote(href.split('=', 1)[1].split('&', 1)[0]))
    if websites:
        kwargs['websites'] = websites

    def parse_num(s):
        '''Convert a displayed counter such as '1,234', '42K', '1.2K' or '3M' into a (value, granularity) tuple.

        The granularity is the unit of the last digit shown, e.g. 1000 for '42K' and 100 for '1.2K'.'''

        s = s.strip().replace(',', '')
        exponent = {'K': 3, 'M': 6}.get(s[-1:], 0)
        if not exponent:
            return int(s), 1
        # Work on the digit strings rather than on floats so that no rounding error can creep in.
        whole, _, fraction = s[:-1].partition('.')
        fraction = fraction[:exponent]
        return int(whole + fraction.ljust(exponent, '0')), 10 ** (exponent - len(fraction))

    countsDiv = soup.find('div', class_ = 'counts_module')
    if countsDiv:
        for a in countsDiv.find_all('a', class_ = 'page_counter'):
            count, granularity = parse_num(a.find('div', class_ = 'count').text)
            label = a.find('div', class_ = 'label').text
            # A counter of exactly one is labelled in the singular; normalise to the plural field name.
            if label in ('follower', 'post', 'photo', 'tag'):
                label = f'{label}s'
            if label in ('followers', 'posts', 'photos', 'tags'):
                kwargs[label], kwargs[f'{label}Granularity'] = count, granularity

    # Both boxes share one layout: a header holding a label and a count.
    # On public pages, 'public_followers' is where followers are listed; it is handled after the counters above, so its value wins if both are present.
    for divId, expectedLabel, key in (
        ('profile_idols', 'Following', 'following'),
        ('public_followers', 'Followers', 'followers'),
    ):
        boxDiv = soup.find('div', id = divId)
        topDiv = boxDiv.find('div', class_ = 'header_top') if boxDiv else None
        if topDiv and topDiv.find('span', class_ = 'header_label').text == expectedLabel:
            kwargs[key], kwargs[f'{key}Granularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)

    return User(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
|
||||
Reference in New Issue
Block a user