mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-09 19:08:28 +03:00
Replace named tuples with dataclasses and move JSON conversion logic to the base classes
Named tuples were never really adequate for this, since the ordering of their fields is meaningless for these objects. Further, typing.NamedTuple does not support multiple inheritance, which meant that, for example, the objects returned by get_items() were not actually instances of Item. Since Python 3.9, classes that combine typing.NamedTuple with another base class can no longer be created at all. Fixes #111
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
import abc
|
||||
import dataclasses
|
||||
import datetime
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import time
|
||||
@@ -8,7 +11,28 @@ import time
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Item:
|
||||
def _json_serialise_datetime(obj):
    '''Fallback serialiser for json.dumps that renders dates and datetimes as ISO-8601 strings.

    Intended to be passed as the `default` argument of json.dumps. Any object that is neither a datetime.datetime nor a datetime.date is rejected with a TypeError.'''

    # Reject unsupported types first so the happy path is the final statement.
    if not isinstance(obj, (datetime.datetime, datetime.date)):
        raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
    return obj.isoformat()
|
||||
|
||||
|
||||
@dataclasses.dataclass
class _JSONDataclass:
    '''A base class for dataclasses for conversion to JSON'''

    def json(self):
        '''Convert the object to a JSON string

        dataclasses.asdict already recurses into nested dataclass instances (also inside lists, tuples, and dicts), so the resulting dict never contains dataclass objects and no further per-field conversion is needed. Values that json cannot encode natively are passed to _json_serialise_datetime, which converts dates and datetimes to ISO-8601 strings and raises TypeError for anything else.'''

        # No post-processing of the asdict() result: calling .json() on a nested value
        # would embed an already-encoded JSON string and double-encode it on output.
        return json.dumps(dataclasses.asdict(self), default = _json_serialise_datetime)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Item(_JSONDataclass):
|
||||
'''An abstract base class for an item returned by the scraper's get_items generator.
|
||||
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
||||
@@ -18,7 +42,8 @@ class Item:
|
||||
pass
|
||||
|
||||
|
||||
class Entity:
|
||||
@dataclasses.dataclass
|
||||
class Entity(_JSONDataclass):
|
||||
'''An abstract base class for an entity returned by the scraper's entity property.
|
||||
|
||||
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''
|
||||
|
||||
@@ -2,7 +2,6 @@ import argparse
|
||||
import contextlib
|
||||
import datetime
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
import requests.models
|
||||
# Imported in parse_args() after setting up the logger:
|
||||
@@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_):
|
||||
rootLogger.addHandler(handler)
|
||||
|
||||
|
||||
def json_serialise_datetime(obj):
    '''Fallback serialiser for json.dumps: dates and datetimes become ISO-8601 strings, anything else raises TypeError.'''

    # Reject unsupported types first so the happy path is the final statement.
    if not isinstance(obj, (datetime.datetime, datetime.date)):
        raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
    return obj.isoformat()
|
||||
|
||||
|
||||
def namedtuple_to_dict_recursive(obj):
    '''Convert a NamedTuple to a dict; also converts NamedTuples in its values to dicts

    Dicts are processed value by value, and plain tuples and lists are rebuilt with the same type from their converted elements. Any other object is returned unchanged. The input is never modified: dicts are returned as new plain dict objects.'''

    # A named tuple is recognised as a tuple that offers _asdict().
    if isinstance(obj, tuple) and hasattr(obj, '_asdict'):
        obj = obj._asdict()
    if isinstance(obj, dict):
        # Build a fresh dict instead of assigning into obj, so that a dict passed in by the caller is left untouched.
        return {key: namedtuple_to_dict_recursive(value) for key, value in obj.items()}
    if isinstance(obj, (tuple, list)):
        return type(obj)(namedtuple_to_dict_recursive(value) for value in obj)
    return obj
|
||||
|
||||
|
||||
def main():
|
||||
setup_logging()
|
||||
args = parse_args()
|
||||
@@ -256,7 +235,7 @@ def main():
|
||||
with _dump_locals_on_exception():
|
||||
if args.withEntity and (entity := scraper.entity):
|
||||
if args.jsonl:
|
||||
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
|
||||
print(entity.json())
|
||||
else:
|
||||
print(entity)
|
||||
if args.maxResults == 0:
|
||||
@@ -267,7 +246,7 @@ def main():
|
||||
logger.info(f'Exiting due to reaching older results than {args.since}')
|
||||
break
|
||||
if args.jsonl:
|
||||
print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime))
|
||||
print(item.json())
|
||||
elif args.format is not None:
|
||||
print(args.format.format(**item._asdict()))
|
||||
else:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
@@ -11,7 +12,8 @@ import urllib.parse
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class FacebookPost(snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
date: datetime.datetime
|
||||
@@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
pageId: int
|
||||
name: str
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import dataclasses
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
@@ -10,7 +11,8 @@ import typing
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class InstagramPost(snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
date: datetime.datetime
|
||||
@@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: typing.Optional[str]
|
||||
followers: int
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
@@ -13,7 +14,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
||||
|
||||
class Submission(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class Submission(snscrape.base.Item):
|
||||
author: typing.Optional[str] # E.g. submission hf7k6
|
||||
created: datetime.datetime
|
||||
id: str
|
||||
@@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class Comment(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class Comment(snscrape.base.Item):
|
||||
author: typing.Optional[str]
|
||||
body: str
|
||||
created: datetime.datetime
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
@@ -11,7 +12,8 @@ logger = logging.getLogger(__name__)
|
||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
|
||||
|
||||
class LinkPreview(typing.NamedTuple):
|
||||
@dataclasses.dataclass
|
||||
class LinkPreview:
|
||||
href: str
|
||||
siteName: typing.Optional[str] = None
|
||||
title: typing.Optional[str] = None
|
||||
@@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple):
|
||||
image: typing.Optional[str] = None
|
||||
|
||||
|
||||
class TelegramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class TelegramPost(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
@@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class Channel(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class Channel(snscrape.base.Entity):
|
||||
username: str
|
||||
title: str
|
||||
verified: bool
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import email.utils
|
||||
import itertools
|
||||
@@ -17,7 +18,8 @@ logger = logging.getLogger(__name__)
|
||||
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||
|
||||
|
||||
class Tweet(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class Tweet(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
@@ -49,39 +51,45 @@ class Medium:
|
||||
pass
|
||||
|
||||
|
||||
class Photo(typing.NamedTuple, Medium):
|
||||
@dataclasses.dataclass
|
||||
class Photo(Medium):
|
||||
previewUrl: str
|
||||
fullUrl: str
|
||||
type: str = 'photo'
|
||||
|
||||
|
||||
class VideoVariant(typing.NamedTuple):
|
||||
@dataclasses.dataclass
|
||||
class VideoVariant:
|
||||
contentType: str
|
||||
url: str
|
||||
bitrate: typing.Optional[int]
|
||||
|
||||
|
||||
class Video(typing.NamedTuple, Medium):
|
||||
@dataclasses.dataclass
|
||||
class Video(Medium):
|
||||
thumbnailUrl: str
|
||||
variants: typing.List[VideoVariant]
|
||||
duration: float
|
||||
type: str = 'video'
|
||||
|
||||
|
||||
class Gif(typing.NamedTuple, Medium):
|
||||
@dataclasses.dataclass
|
||||
class Gif(Medium):
|
||||
thumbnailUrl: str
|
||||
variants: typing.List[VideoVariant]
|
||||
type: str = 'gif'
|
||||
|
||||
|
||||
class DescriptionURL(typing.NamedTuple):
|
||||
@dataclasses.dataclass
|
||||
class DescriptionURL:
|
||||
text: str
|
||||
url: str
|
||||
tcourl: str
|
||||
indices: typing.Tuple[int, int]
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
# Most fields can be None if they're not known.
|
||||
|
||||
username: str
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import bs4
|
||||
import collections
|
||||
import dataclasses
|
||||
import datetime
|
||||
import itertools
|
||||
import logging
|
||||
@@ -11,7 +12,8 @@ import urllib.parse
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class VKontaktePost(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
@@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: str
|
||||
verified: bool
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import snscrape.base
|
||||
@@ -10,7 +11,8 @@ logger = logging.getLogger(__name__)
|
||||
_userDoesNotExist = object()
|
||||
|
||||
|
||||
class Post(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class Post(snscrape.base.Item):
|
||||
url: str
|
||||
id: str
|
||||
user: typing.Optional['User']
|
||||
@@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item):
|
||||
return self.url
|
||||
|
||||
|
||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
screenname: str
|
||||
uid: int
|
||||
verified: bool
|
||||
|
||||
Reference in New Issue
Block a user