mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 02:28:29 +03:00
Remove Twitter list member scraper
It has been broken for a while. Member lists were removed from the old design, and they're behind a login wall on the new design.
This commit is contained in:
@@ -6,7 +6,7 @@ The following services are currently supported:
|
|||||||
* Instagram: user profiles, hashtags, and locations
|
* Instagram: user profiles, hashtags, and locations
|
||||||
* Reddit: users, subreddits, and searches (via Pushshift)
|
* Reddit: users, subreddits, and searches (via Pushshift)
|
||||||
* Telegram: channels
|
* Telegram: channels
|
||||||
* Twitter: users, user profiles, hashtags, searches, threads, and lists (members as well as posts)
|
* Twitter: users, user profiles, hashtags, searches, threads, and list posts
|
||||||
* VKontakte: user profiles
|
* VKontakte: user profiles
|
||||||
* Weibo (Sina Weibo): user profiles
|
* Weibo (Sina Weibo): user profiles
|
||||||
|
|
||||||
|
|||||||
@@ -81,8 +81,7 @@ class DescriptionURL(typing.NamedTuple):
|
|||||||
indices: typing.Tuple[int, int]
|
indices: typing.Tuple[int, int]
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Item, snscrape.base.Entity):
|
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||||
# This is both an Item and an Entity since it can be returned as TwitterUserScraper's entity as well as TwitterListMembersScraper's items.
|
|
||||||
# Most fields can be None if they're not known.
|
# Most fields can be None if they're not known.
|
||||||
|
|
||||||
username: str
|
username: str
|
||||||
@@ -631,56 +630,3 @@ class TwitterListPostsScraper(TwitterSearchScraper):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_args(cls, args):
|
def from_args(cls, args):
|
||||||
return cls(args.list, retries = args.retries)
|
return cls(args.list, retries = args.retries)
|
||||||
|
|
||||||
|
|
||||||
class TwitterListMembersScraper(TwitterOldDesignScraper):
|
|
||||||
name = 'twitter-list-members'
|
|
||||||
|
|
||||||
def __init__(self, listName, **kwargs):
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self._user, self._list = listName.split('/')
|
|
||||||
|
|
||||||
def get_items(self):
|
|
||||||
headers = {'User-Agent': f'Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18'}
|
|
||||||
|
|
||||||
baseUrl = f'https://twitter.com/{self._user}/lists/{self._list}/members'
|
|
||||||
r = self._get(baseUrl, headers = headers)
|
|
||||||
if r.status_code != 200:
|
|
||||||
logger.warning('List not found')
|
|
||||||
return
|
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
|
||||||
container = soup.find('div', 'stream-container')
|
|
||||||
if not container:
|
|
||||||
raise snscrape.base.ScraperException('Unable to find container')
|
|
||||||
items = container.find_all('li', 'js-stream-item')
|
|
||||||
if not items:
|
|
||||||
logger.warning('Empty list')
|
|
||||||
return
|
|
||||||
for item in items:
|
|
||||||
yield User(username = item.find('div', 'account')['data-screen-name'])
|
|
||||||
|
|
||||||
if not container.has_attr('data-min-position') or container['data-min-position'] == '':
|
|
||||||
return
|
|
||||||
maxPosition = container['data-min-position']
|
|
||||||
while True:
|
|
||||||
r = self._get(
|
|
||||||
f'{baseUrl}/timeline?include_available_features=1&include_entities=1&max_position={maxPosition}&reset_error_state=false',
|
|
||||||
headers = headers,
|
|
||||||
responseOkCallback = self._check_json_callback
|
|
||||||
)
|
|
||||||
obj = json.loads(r.text)
|
|
||||||
soup = bs4.BeautifulSoup(obj['items_html'], 'lxml')
|
|
||||||
items = soup.find_all('li', 'js-stream-item')
|
|
||||||
for item in items:
|
|
||||||
yield User(username = item.find('div', 'account')['data-screen-name'])
|
|
||||||
if not obj['has_more_items']:
|
|
||||||
break
|
|
||||||
maxPosition = obj['min_position']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def setup_parser(cls, subparser):
|
|
||||||
subparser.add_argument('list', help = 'A Twitter list, formatted as "username/listname"')
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_args(cls, args):
|
|
||||||
return cls(args.list, retries = args.retries)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user