mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 12:28:28 +03:00
Add support for extracting the entity behind a scrape
Closes #11.

Backwards incompatibility: snscrape.modules.twitter.Account is now called User. However, this was previously only used in the list member scraper, which has been broken for a while because the list of members is no longer publicly accessible.

For compatibility reasons, the CLI does not output the entity by default; the new option --with-entity enables it.
This commit is contained in:
@@ -17,6 +17,22 @@ class Item:
|
||||
pass
|
||||
|
||||
|
||||
class Entity:
    '''An abstract base class for an entity returned by the scraper's get_entity method.

    An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''

    # NOTE: Entity does not use abc.ABCMeta as its metaclass, so this decorator only
    # marks the method as abstract; it does not by itself prevent instantiation.
    # Subclasses are still expected to override __str__.
    @abc.abstractmethod
    def __str__(self):
        pass
|
||||
|
||||
|
||||
Granularity = int
'''Type of fields storing the unit/granularity of numbers.

For example, a granularity of 1000 means that the SNS returned something like '42k' and the last three significant digits are unknown.'''
|
||||
|
||||
|
||||
class URLItem(Item):
|
||||
'''A generic item which only holds a URL string.'''
|
||||
|
||||
@@ -49,6 +65,10 @@ class Scraper:
|
||||
'''Iterator yielding Items.'''
|
||||
pass
|
||||
|
||||
def get_entity(self):
    '''Get the entity behind the scraper, if any.

    This base implementation always returns None, i.e. it reports that there is no entity.'''
    return None
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
|
||||
Reference in New Issue
Block a user