Refactor username vs ID mess

Closes #354
This commit is contained in:
JustAnotherArchivist
2022-01-12 22:36:26 +00:00
parent e6076353c8
commit eebdfc1c55
2 changed files with 34 additions and 40 deletions

View File

@@ -694,13 +694,13 @@ class TwitterSearchScraper(_TwitterAPIScraper):
class TwitterUserScraper(TwitterSearchScraper): class TwitterUserScraper(TwitterSearchScraper):
name = 'twitter-user' name = 'twitter-user'
def __init__(self, username, isUserId = False, **kwargs): def __init__(self, user, **kwargs):
if not self.is_valid_username(username): self._isUserId = isinstance(user, int)
if not self._isUserId and not self.is_valid_username(user):
raise ValueError('Invalid username') raise ValueError('Invalid username')
super().__init__(f'from:{username}', **kwargs) super().__init__(f'from:{user}', **kwargs)
self._username = username self._user = user
self._isUserId = isUserId self._baseUrl = f'https://twitter.com/{self._user}' if not self._isUserId else f'https://twitter.com/i/user/{self._user}'
self._baseUrl = f'https://twitter.com/{self._username}' if not self._isUserId else f'https://twitter.com/i/user/{self._username}'
def _get_entity(self): def _get_entity(self):
self._ensure_guest_token() self._ensure_guest_token()
@@ -710,7 +710,7 @@ class TwitterUserScraper(TwitterSearchScraper):
else: else:
fieldName = 'userId' fieldName = 'userId'
endpoint = 'https://twitter.com/i/api/graphql/WN6Hck-Pwm-YP0uxVj1oMQ/UserByRestIdWithoutResults' endpoint = 'https://twitter.com/i/api/graphql/WN6Hck-Pwm-YP0uxVj1oMQ/UserByRestIdWithoutResults'
params = {'variables': json.dumps({fieldName: self._username, 'withHighlightedLabel': True}, separators = (',', ':'))} params = {'variables': json.dumps({fieldName: str(self._user), 'withHighlightedLabel': True}, separators = (',', ':'))}
obj = self._get_api_data(endpoint, params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)) obj = self._get_api_data(endpoint, params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote))
if not obj['data']: if not obj['data']:
return None return None
@@ -747,28 +747,28 @@ class TwitterUserScraper(TwitterSearchScraper):
def get_items(self): def get_items(self):
if self._isUserId: if self._isUserId:
# Resolve user ID to username # Resolve user ID to username
self._username = self.entity.username self._user = self.entity.username
self._isUserId = False self._isUserId = False
self._query = f'from:{self._username}' self._query = f'from:{self._user}'
yield from super().get_items() yield from super().get_items()
@staticmethod @staticmethod
def is_valid_username(s): def is_valid_username(s):
return (1 <= len(s) <= 15 and s.strip(string.ascii_letters + string.digits + '_') == '') or (s and s.strip(string.digits) == '') return 1 <= len(s) <= 15 and s.strip(string.ascii_letters + string.digits + '_') == ''
@classmethod @classmethod
def _cli_setup_parser(cls, subparser): def _cli_setup_parser(cls, subparser):
def username(s): def user(s):
if cls.is_valid_username(s): if cls.is_valid_username(s) or s.isdigit():
return s return s
raise ValueError('Invalid username') raise ValueError('Invalid username or ID')
subparser.add_argument('--user-id', dest = 'isUserId', action = 'store_true', default = False, help = 'Use user ID instead of username') subparser.add_argument('--user-id', dest = 'isUserId', action = 'store_true', default = False, help = 'Use user ID instead of username')
subparser.add_argument('username', type = username, help = 'A Twitter username (without @)') subparser.add_argument('user', type = user, help = 'A Twitter username (without @)')
@classmethod @classmethod
def _cli_from_args(cls, args): def _cli_from_args(cls, args):
return cls._cli_construct(args, args.username, args.isUserId) return cls._cli_construct(args, user = int(args.user) if args.isUserId else args.user)
class TwitterProfileScraper(TwitterUserScraper): class TwitterProfileScraper(TwitterUserScraper):
@@ -778,7 +778,7 @@ class TwitterProfileScraper(TwitterUserScraper):
if not self._isUserId: if not self._isUserId:
userId = self.entity.id userId = self.entity.id
else: else:
userId = self._username userId = self._user
paginationParams = { paginationParams = {
'include_profile_interstitial_type': '1', 'include_profile_interstitial_type': '1',
'include_blocking': '1', 'include_blocking': '1',

View File

@@ -52,24 +52,23 @@ class User(snscrape.base.Entity):
class WeiboUserScraper(snscrape.base.Scraper): class WeiboUserScraper(snscrape.base.Scraper):
name = 'weibo-user' name = 'weibo-user'
def __init__(self, name, uid, **kwargs): def __init__(self, user, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self._name = name self._user = user
self._uid = uid self._isUserId = isinstance(user, int)
if self._name is None and self._uid is None:
raise ValueError('name or uid must not be None')
self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
def _ensure_uid(self): def _ensure_user_id(self):
if self._uid is not None: if self._isUserId:
return return
r = self._get(f'https://m.weibo.cn/n/{self._name}', headers = self._headers, allowRedirects = False) r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
if r.status_code == 302 and r.headers['Location'].startswith('/u/') and len(r.headers['Location']) == 13 and r.headers['Location'][3:].strip('0123456789') == '': if r.status_code == 302 and r.headers['Location'].startswith('/u/') and len(r.headers['Location']) == 13 and r.headers['Location'][3:].strip('0123456789') == '':
# Redirect to uid URL # Redirect to uid URL
self._uid = int(r.headers['Location'][3:]) self._user = int(r.headers['Location'][3:])
self._isUserId = True
elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text: elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
_logger.warning('User does not exist') _logger.warning('User does not exist')
self._uid = _userDoesNotExist self._user = _userDoesNotExist
else: else:
raise snscrape.base.ScraperError(f'Got unexpected response on resolving username ({r.status_code})') raise snscrape.base.ScraperError(f'Got unexpected response on resolving username ({r.status_code})')
@@ -99,13 +98,13 @@ class WeiboUserScraper(snscrape.base.Scraper):
) )
def get_items(self): def get_items(self):
self._ensure_uid() self._ensure_user_id()
if self._uid is _userDoesNotExist: if self._user is _userDoesNotExist:
return return
sinceId = None sinceId = None
while True: while True:
sinceParam = f'&since_id={sinceId}' if sinceId is not None else '' sinceParam = f'&since_id={sinceId}' if sinceId is not None else ''
r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._uid}&containerid=107603{self._uid}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response) r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response)
if r.status_code != 200: if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}') raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
o = r.json() o = r.json()
@@ -133,10 +132,10 @@ class WeiboUserScraper(snscrape.base.Scraper):
) )
def _get_entity(self): def _get_entity(self):
self._ensure_uid() self._ensure_user_id()
if self._uid is _userDoesNotExist: if self._user is _userDoesNotExist:
return return
r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._uid}', headers = self._headers) r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
if r.status_code != 200: if r.status_code != 200:
raise snscrape.base.ScraperException('Could not fetch user info') raise snscrape.base.ScraperException('Could not fetch user info')
o = r.json() o = r.json()
@@ -144,14 +143,9 @@ class WeiboUserScraper(snscrape.base.Scraper):
@classmethod @classmethod
def _cli_setup_parser(cls, subparser): def _cli_setup_parser(cls, subparser):
subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user name or ID') subparser.add_argument('--name', dest = 'isName', action = 'store_true', help = 'Use username instead of user ID')
subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user ID')
@classmethod @classmethod
def _cli_from_args(cls, args): def _cli_from_args(cls, args):
if len(args.user) == 10 and args.user.strip('0123456789') == '': return cls._cli_construct(args, user = args.user if args.isName else int(args.user))
uid = args.user
name = None
else:
uid = None
name = args.user
return cls._cli_construct(args, name = name, uid = uid)