Replace named tuples with dataclasses and move JSON conversion logic to the base classes

Named tuples were never really adequate for this, since the ordering of their fields has no meaning for these objects.
Further, named tuples don't support multiple inheritance. This meant that the objects returned by get_items() were not actually Items, for example. Since Python 3.9, such named tuples cannot be created anymore.

Fixes #111
This commit is contained in:
JustAnotherArchivist
2020-10-15 23:41:30 +00:00
parent ffd9289edc
commit bd53e729a0
9 changed files with 76 additions and 45 deletions

View File

@@ -1,5 +1,8 @@
import abc
import dataclasses
import datetime
import functools
import json
import logging
import requests
import time
@@ -8,7 +11,28 @@ import time
logger = logging.getLogger(__name__)
class Item:
def _json_serialise_datetime(obj):
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
if isinstance(obj, (datetime.datetime, datetime.date)):
return obj.isoformat()
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
@dataclasses.dataclass
class _JSONDataclass:
    '''A base class for dataclasses for conversion to JSON'''

    def json(self):
        '''Convert the object to a JSON string.

        dataclasses.asdict already recurses into nested dataclass instances (also inside lists, tuples, and dicts), so its result can be dumped directly.
        No value in that dict is ever a dataclass instance, which is why no per-field conversion is done here; calling .json() on a nested value would also embed an already-encoded string.
        datetime.datetime and datetime.date values are emitted as ISO-8601 strings via _json_serialise_datetime; any other unsupported value raises TypeError.'''
        return json.dumps(dataclasses.asdict(self), default = _json_serialise_datetime)
@dataclasses.dataclass
class Item(_JSONDataclass):
'''An abstract base class for an item returned by the scraper's get_items generator.
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
@@ -18,7 +42,8 @@ class Item:
pass
class Entity:
@dataclasses.dataclass
class Entity(_JSONDataclass):
'''An abstract base class for an entity returned by the scraper's entity property.
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''

View File

@@ -2,7 +2,6 @@ import argparse
import contextlib
import datetime
import inspect
import json
import logging
import requests.models
# Imported in parse_args() after setting up the logger:
@@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_):
rootLogger.addHandler(handler)
def json_serialise_datetime(obj):
    '''Serialise datetime.datetime and datetime.date objects to ISO-8601 strings for json.dumps.

    Intended as the `default` callback of json.dumps; raises TypeError for any other type.'''
    is_temporal = isinstance(obj, (datetime.datetime, datetime.date))
    if is_temporal:
        return obj.isoformat()
    raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
def namedtuple_to_dict_recursive(obj):
    '''Convert a NamedTuple to a dict, also converting NamedTuples nested in its values.

    - A named tuple (a tuple exposing _asdict) becomes the dict returned by _asdict(), with its values converted recursively.
    - A dict has its values converted recursively *in place* and the same dict object is returned.
    - A plain tuple or list is rebuilt as the same type with each element converted.
    - Anything else is returned unchanged.'''
    is_namedtuple = isinstance(obj, tuple) and hasattr(obj, '_asdict')
    if is_namedtuple or isinstance(obj, dict):
        mapping = obj._asdict() if is_namedtuple else obj
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for field, inner in mapping.items():
            mapping[field] = namedtuple_to_dict_recursive(inner)
        return mapping
    if isinstance(obj, (tuple, list)):
        return type(obj)(map(namedtuple_to_dict_recursive, obj))
    return obj
def main():
setup_logging()
args = parse_args()
@@ -256,7 +235,7 @@ def main():
with _dump_locals_on_exception():
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
print(entity.json())
else:
print(entity)
if args.maxResults == 0:
@@ -267,7 +246,7 @@ def main():
logger.info(f'Exiting due to reaching older results than {args.since}')
break
if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime))
print(item.json())
elif args.format is not None:
print(args.format.format(**item._asdict()))
else:

View File

@@ -1,4 +1,5 @@
import bs4
import dataclasses
import datetime
import json
import logging
@@ -11,7 +12,8 @@ import urllib.parse
logger = logging.getLogger(__name__)
class FacebookPost(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class FacebookPost(snscrape.base.Item):
cleanUrl: str
dirtyUrl: str
date: datetime.datetime
@@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item):
return self.cleanUrl
class User(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
pageId: int
name: str

View File

@@ -1,3 +1,4 @@
import dataclasses
import datetime
import hashlib
import json
@@ -10,7 +11,8 @@ import typing
logger = logging.getLogger(__name__)
class InstagramPost(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class InstagramPost(snscrape.base.Item):
cleanUrl: str
dirtyUrl: str
date: datetime.datetime
@@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
return self.cleanUrl
class User(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
name: typing.Optional[str]
followers: int

View File

@@ -1,3 +1,4 @@
import dataclasses
import datetime
import logging
import re
@@ -13,7 +14,8 @@ logger = logging.getLogger(__name__)
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
class Submission(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class Submission(snscrape.base.Item):
author: typing.Optional[str] # E.g. submission hf7k6
created: datetime.datetime
id: str
@@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item):
return self.url
class Comment(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class Comment(snscrape.base.Item):
author: typing.Optional[str]
body: str
created: datetime.datetime

View File

@@ -1,4 +1,5 @@
import bs4
import dataclasses
import datetime
import logging
import re
@@ -11,7 +12,8 @@ logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
class LinkPreview(typing.NamedTuple):
@dataclasses.dataclass
class LinkPreview:
href: str
siteName: typing.Optional[str] = None
title: typing.Optional[str] = None
@@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple):
image: typing.Optional[str] = None
class TelegramPost(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
url: str
date: datetime.datetime
content: str
@@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item):
return self.url
class Channel(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class Channel(snscrape.base.Entity):
username: str
title: str
verified: bool

View File

@@ -1,4 +1,5 @@
import bs4
import dataclasses
import datetime
import email.utils
import itertools
@@ -17,7 +18,8 @@ logger = logging.getLogger(__name__)
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
class Tweet(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class Tweet(snscrape.base.Item):
url: str
date: datetime.datetime
content: str
@@ -49,39 +51,45 @@ class Medium:
pass
class Photo(typing.NamedTuple, Medium):
@dataclasses.dataclass
class Photo(Medium):
    '''A Medium subclass holding the URLs of a photo.'''
    # NOTE(review): presumably previewUrl is a reduced-size rendition and fullUrl the original; confirm against the code that builds Photo.
    previewUrl: str
    fullUrl: str
    # Defaults to the literal 'photo'; Video and Gif carry the same field with their own default.
    type: str = 'photo'
class VideoVariant(typing.NamedTuple):
@dataclasses.dataclass
class VideoVariant:
    '''One variant of a video or GIF, as listed in the `variants` field of Video and Gif.'''
    contentType: str
    url: str
    # May be None; there is no default, so it must still be passed explicitly.
    bitrate: typing.Optional[int]
class Video(typing.NamedTuple, Medium):
@dataclasses.dataclass
class Video(Medium):
    '''A Medium subclass describing a video through its thumbnail, its variants, and its duration.'''
    thumbnailUrl: str
    variants: typing.List[VideoVariant]
    # NOTE(review): unit of duration is not visible here (presumably seconds); confirm against the caller.
    duration: float
    # Defaults to the literal 'video'.
    type: str = 'video'
class Gif(typing.NamedTuple, Medium):
@dataclasses.dataclass
class Gif(Medium):
    '''A Medium subclass describing a GIF through its thumbnail and its variants.'''
    thumbnailUrl: str
    variants: typing.List[VideoVariant]
    # Defaults to the literal 'gif'.
    type: str = 'gif'
class DescriptionURL(typing.NamedTuple):
@dataclasses.dataclass
class DescriptionURL:
    '''A URL entry with its display text, target URL, t.co URL, and a pair of integer indices.'''
    text: str
    url: str
    tcourl: str
    # NOTE(review): presumably the (start, end) offsets of the URL within the description text; confirm against the caller.
    indices: typing.Tuple[int, int]
class User(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class User(snscrape.base.Entity):
# Most fields can be None if they're not known.
username: str

View File

@@ -1,5 +1,6 @@
import bs4
import collections
import dataclasses
import datetime
import itertools
import logging
@@ -11,7 +12,8 @@ import urllib.parse
logger = logging.getLogger(__name__)
class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
url: str
date: datetime.datetime
content: str
@@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
return self.url
class User(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
name: str
verified: bool

View File

@@ -1,4 +1,5 @@
import bs4
import dataclasses
import datetime
import logging
import snscrape.base
@@ -10,7 +11,8 @@ logger = logging.getLogger(__name__)
_userDoesNotExist = object()
class Post(typing.NamedTuple, snscrape.base.Item):
@dataclasses.dataclass
class Post(snscrape.base.Item):
url: str
id: str
user: typing.Optional['User']
@@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item):
return self.url
class User(typing.NamedTuple, snscrape.base.Entity):
@dataclasses.dataclass
class User(snscrape.base.Entity):
screenname: str
uid: int
verified: bool