Cache entities

This commit is contained in:
JustAnotherArchivist
2020-09-01 02:34:21 +00:00
parent 4f24843f89
commit 1a2e367a87
7 changed files with 16 additions and 9 deletions

View File

@@ -1,4 +1,5 @@
import abc
import functools
import logging
import requests
import time
@@ -18,7 +19,7 @@ class Item:
class Entity:
'''An abstract base class for an entity returned by the scraper's get_entity method.
'''An abstract base class for an entity returned by the scraper's entity property.
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''
@@ -65,10 +66,16 @@ class Scraper:
'''Iterator yielding Items.'''
pass
def get_entity(self):
'''Get the entity behind the scraper, if any.'''
def _get_entity(self):
'''Get the entity behind the scraper, if any.
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
This base implementation always returns None.'''
return None
@functools.cached_property
def entity(self):
'''The entity behind the scraper, if any.
The value is obtained from _get_entity() on first access; functools.cached_property then stores it on the instance, so later accesses do not call _get_entity() again.'''
return self._get_entity()
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
for attempt in range(self._retries + 1):
# The request is newly prepared on each retry because of potential cookie updates.

View File

@@ -238,7 +238,7 @@ def main():
i = 0
with _dump_locals_on_exception():
if args.withEntity:
entity = scraper.get_entity()
entity = scraper.entity
if entity:
if args.jsonl:
print(json.dumps(entity._asdict(), default = json_serialise_datetime))

View File

@@ -218,7 +218,7 @@ class FacebookUserScraper(FacebookUserAndCommunityScraper):
super().__init__(*args, **kwargs)
self._baseUrl = f'https://www.facebook.com/{self._username}/'
def get_entity(self):
def _get_entity(self):
kwargs = {}
nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')

View File

@@ -179,7 +179,7 @@ class InstagramUserScraper(InstagramCommonScraper):
def from_args(cls, args):
return cls('User', args.username, retries = args.retries)
def get_entity(self):
def _get_entity(self):
r = self._initial_page()
if r.status_code != 200:
return

View File

@@ -89,7 +89,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
def get_entity(self):
def _get_entity(self):
kwargs = {}
# /channel has a more accurate member count and bigger profile picture
r = self._get(f'https://t.me/{self._name}', headers = self._headers)

View File

@@ -305,7 +305,7 @@ class TwitterUserScraper(TwitterSearchScraper):
super().__init__(f'from:{username}', **kwargs)
self._username = username
def get_entity(self):
def _get_entity(self):
self._ensure_guest_token(f'https://twitter.com/{self._username}')
params = {'variables': json.dumps({'screen_name': self._username, 'withHighlightedLabel': True}, separators = (',', ':'))}
obj = self._get_api_data('https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName', params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote))

View File

@@ -120,7 +120,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
soup = bs4.BeautifulSoup(posts, 'lxml')
yield from self._soup_to_items(soup)
def get_entity(self):
def _get_entity(self):
r, soup = self._initial_page()
if r.status_code != 200:
return