Cache entities

2026-06-08 10:38:28 +03:00 · 2020-09-01 02:34:21 +00:00
parent 4f24843f89
commit 1a2e367a87
7 changed files with 16 additions and 9 deletions
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -1,4 +1,5 @@
 import abc
+import functools
 import logging
 import requests
 import time
@@ -18,7 +19,7 @@ class Item:


 class Entity:
-	'''An abstract base class for an entity returned by the scraper's get_entity method.
+	'''An abstract base class for an entity returned by the scraper's entity property.

 	An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''

@@ -65,10 +66,16 @@ class Scraper:
 		'''Iterator yielding Items.'''
 		pass

-	def get_entity(self):
-		'''Get the entity behind the scraper, if any.'''
+	def _get_entity(self):
+		'''Get the entity behind the scraper, if any.
+
+		This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.'''
 		return None

+	@functools.cached_property
+	def entity(self):
+		return self._get_entity()
+
 	def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
 		for attempt in range(self._retries + 1):
 			# The request is newly prepared on each retry because of potential cookie updates.
--- a/snscrape/cli.py
+++ b/snscrape/cli.py
@@ -238,7 +238,7 @@ def main():
 	i = 0
 	with _dump_locals_on_exception():
 		if args.withEntity:
-			entity = scraper.get_entity()
+			entity = scraper.entity
 			if entity:
 				if args.jsonl:
 					print(json.dumps(entity._asdict(), default = json_serialise_datetime))
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -218,7 +218,7 @@ class FacebookUserScraper(FacebookUserAndCommunityScraper):
 		super().__init__(*args, **kwargs)
 		self._baseUrl = f'https://www.facebook.com/{self._username}/'

-	def get_entity(self):
+	def _get_entity(self):
 		kwargs = {}

 		nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
--- a/snscrape/modules/instagram.py
+++ b/snscrape/modules/instagram.py
@@ -179,7 +179,7 @@ class InstagramUserScraper(InstagramCommonScraper):
 	def from_args(cls, args):
 		return cls('User', args.username, retries = args.retries)

-	def get_entity(self):
+	def _get_entity(self):
 		r = self._initial_page()
 		if r.status_code != 200:
 			return
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -89,7 +89,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			soup = bs4.BeautifulSoup(r.text, 'lxml')

-	def get_entity(self):
+	def _get_entity(self):
 		kwargs = {}
 		# /channel has a more accurate member count and bigger profile picture
 		r = self._get(f'https://t.me/{self._name}', headers = self._headers)
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -305,7 +305,7 @@ class TwitterUserScraper(TwitterSearchScraper):
 		super().__init__(f'from:{username}', **kwargs)
 		self._username = username

-	def get_entity(self):
+	def _get_entity(self):
 		self._ensure_guest_token(f'https://twitter.com/{self._username}')
 		params = {'variables': json.dumps({'screen_name': self._username, 'withHighlightedLabel': True}, separators = (',', ':'))}
 		obj = self._get_api_data('https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName', params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote))
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -120,7 +120,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 			soup = bs4.BeautifulSoup(posts, 'lxml')
 			yield from self._soup_to_items(soup)

-	def get_entity(self):
+	def _get_entity(self):
 		r, soup = self._initial_page()
 		if r.status_code != 200:
 			return