mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 10:38:28 +03:00
Cache entities
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import abc
|
||||
import functools
|
||||
import logging
|
||||
import requests
|
||||
import time
|
||||
@@ -18,7 +19,7 @@ class Item:
|
||||
|
||||
|
||||
class Entity:
|
||||
'''An abstract base class for an entity returned by the scraper's get_entity method.
|
||||
'''An abstract base class for an entity returned by the scraper's entity property.
|
||||
|
||||
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''
|
||||
|
||||
@@ -65,10 +66,16 @@ class Scraper:
|
||||
'''Iterator yielding Items.'''
|
||||
pass
|
||||
|
||||
def get_entity(self):
|
||||
'''Get the entity behind the scraper, if any.'''
|
||||
def _get_entity(self):
|
||||
'''Get the entity behind the scraper, if any.
|
||||
|
||||
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.'''
|
||||
return None
|
||||
|
||||
@functools.cached_property
|
||||
def entity(self):
|
||||
return self._get_entity()
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
|
||||
@@ -238,7 +238,7 @@ def main():
|
||||
i = 0
|
||||
with _dump_locals_on_exception():
|
||||
if args.withEntity:
|
||||
entity = scraper.get_entity()
|
||||
entity = scraper.entity
|
||||
if entity:
|
||||
if args.jsonl:
|
||||
print(json.dumps(entity._asdict(), default = json_serialise_datetime))
|
||||
|
||||
@@ -218,7 +218,7 @@ class FacebookUserScraper(FacebookUserAndCommunityScraper):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
|
||||
def get_entity(self):
|
||||
def _get_entity(self):
|
||||
kwargs = {}
|
||||
|
||||
nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
|
||||
|
||||
@@ -179,7 +179,7 @@ class InstagramUserScraper(InstagramCommonScraper):
|
||||
def from_args(cls, args):
|
||||
return cls('User', args.username, retries = args.retries)
|
||||
|
||||
def get_entity(self):
|
||||
def _get_entity(self):
|
||||
r = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
@@ -89,7 +89,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
def get_entity(self):
|
||||
def _get_entity(self):
|
||||
kwargs = {}
|
||||
# /channel has a more accurate member count and bigger profile picture
|
||||
r = self._get(f'https://t.me/{self._name}', headers = self._headers)
|
||||
|
||||
@@ -305,7 +305,7 @@ class TwitterUserScraper(TwitterSearchScraper):
|
||||
super().__init__(f'from:{username}', **kwargs)
|
||||
self._username = username
|
||||
|
||||
def get_entity(self):
|
||||
def _get_entity(self):
|
||||
self._ensure_guest_token(f'https://twitter.com/{self._username}')
|
||||
params = {'variables': json.dumps({'screen_name': self._username, 'withHighlightedLabel': True}, separators = (',', ':'))}
|
||||
obj = self._get_api_data('https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName', params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote))
|
||||
|
||||
@@ -120,7 +120,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||
yield from self._soup_to_items(soup)
|
||||
|
||||
def get_entity(self):
|
||||
def _get_entity(self):
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user