Replace named tuples with dataclasses and move JSON conversion logic to the base classes

Named tuples were never really adequate for this, since the ordering of their fields has no meaning here.
Furthermore, named tuples don't support multiple inheritance. This meant, for example, that the objects returned by get_items() were not actually instances of Item. Since Python 3.9, named tuple classes with additional base classes cannot be created at all anymore.

Fixes #111
This commit is contained in:
JustAnotherArchivist
2020-10-15 23:41:30 +00:00
parent ffd9289edc
commit bd53e729a0
9 changed files with 76 additions and 45 deletions

View File

@@ -1,5 +1,8 @@
import abc import abc
import dataclasses
import datetime
import functools import functools
import json
import logging import logging
import requests import requests
import time import time
@@ -8,7 +11,28 @@ import time
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class Item: def _json_serialise_datetime(obj):
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
if isinstance(obj, (datetime.datetime, datetime.date)):
return obj.isoformat()
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
@dataclasses.dataclass
class _JSONDataclass:
    '''A base class for dataclasses for conversion to JSON'''

    def json(self):
        '''Convert the object to a JSON string.

        Fields holding other dataclass instances (directly or inside lists, tuples, and dicts) are
        emitted as nested JSON objects. Values the json module cannot encode natively are handed to
        _json_serialise_datetime.'''

        # dataclasses.asdict already recurses into nested dataclasses and returns plain dicts,
        # so no per-field post-processing is needed. Calling .json() on a field would also be
        # wrong: it would embed a JSON *string* instead of a nested object.
        return json.dumps(dataclasses.asdict(self), default = _json_serialise_datetime)
@dataclasses.dataclass
class Item(_JSONDataclass):
'''An abstract base class for an item returned by the scraper's get_items generator. '''An abstract base class for an item returned by the scraper's get_items generator.
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).''' An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
@@ -18,7 +42,8 @@ class Item:
pass pass
class Entity: @dataclasses.dataclass
class Entity(_JSONDataclass):
'''An abstract base class for an entity returned by the scraper's entity property. '''An abstract base class for an entity returned by the scraper's entity property.
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.''' An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''

View File

@@ -2,7 +2,6 @@ import argparse
import contextlib import contextlib
import datetime import datetime
import inspect import inspect
import json
import logging import logging
import requests.models import requests.models
# Imported in parse_args() after setting up the logger: # Imported in parse_args() after setting up the logger:
@@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_):
rootLogger.addHandler(handler) rootLogger.addHandler(handler)
def json_serialise_datetime(obj):
    '''Fallback serialiser for json.dumps: render datetime.datetime and datetime.date objects as ISO-8601 strings.

    Raises TypeError for any other type.'''

    if not isinstance(obj, (datetime.datetime, datetime.date)):
        raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
    return obj.isoformat()
def namedtuple_to_dict_recursive(obj):
    '''Convert a NamedTuple to a dict; also converts NamedTuples in its values to dicts.

    Dicts, lists, and plain tuples are traversed as well; lists and tuples keep their type.
    Any other object is returned unchanged. The input is never modified: every converted
    container is returned as a new object.'''

    # A NamedTuple is a tuple that provides _asdict(); handle it before the plain tuple case.
    if isinstance(obj, tuple) and hasattr(obj, '_asdict'):
        obj = obj._asdict()
    if isinstance(obj, dict):
        # Build a new dict rather than assigning into obj, so a dict supplied by the caller
        # (possibly nested inside the input) is left untouched.
        return {key: namedtuple_to_dict_recursive(value) for key, value in obj.items()}
    if isinstance(obj, (tuple, list)):
        return type(obj)(namedtuple_to_dict_recursive(value) for value in obj)
    return obj
def main(): def main():
setup_logging() setup_logging()
args = parse_args() args = parse_args()
@@ -256,7 +235,7 @@ def main():
with _dump_locals_on_exception(): with _dump_locals_on_exception():
if args.withEntity and (entity := scraper.entity): if args.withEntity and (entity := scraper.entity):
if args.jsonl: if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime)) print(entity.json())
else: else:
print(entity) print(entity)
if args.maxResults == 0: if args.maxResults == 0:
@@ -267,7 +246,7 @@ def main():
logger.info(f'Exiting due to reaching older results than {args.since}') logger.info(f'Exiting due to reaching older results than {args.since}')
break break
if args.jsonl: if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime)) print(item.json())
elif args.format is not None: elif args.format is not None:
print(args.format.format(**item._asdict())) print(args.format.format(**item._asdict()))
else: else:

View File

@@ -1,4 +1,5 @@
import bs4 import bs4
import dataclasses
import datetime import datetime
import json import json
import logging import logging
@@ -11,7 +12,8 @@ import urllib.parse
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class FacebookPost(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class FacebookPost(snscrape.base.Item):
cleanUrl: str cleanUrl: str
dirtyUrl: str dirtyUrl: str
date: datetime.datetime date: datetime.datetime
@@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item):
return self.cleanUrl return self.cleanUrl
class User(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class User(snscrape.base.Entity):
username: str username: str
pageId: int pageId: int
name: str name: str

View File

@@ -1,3 +1,4 @@
import dataclasses
import datetime import datetime
import hashlib import hashlib
import json import json
@@ -10,7 +11,8 @@ import typing
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class InstagramPost(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class InstagramPost(snscrape.base.Item):
cleanUrl: str cleanUrl: str
dirtyUrl: str dirtyUrl: str
date: datetime.datetime date: datetime.datetime
@@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
return self.cleanUrl return self.cleanUrl
class User(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class User(snscrape.base.Entity):
username: str username: str
name: typing.Optional[str] name: typing.Optional[str]
followers: int followers: int

View File

@@ -1,3 +1,4 @@
import dataclasses
import datetime import datetime
import logging import logging
import re import re
@@ -13,7 +14,8 @@ logger = logging.getLogger(__name__)
# Most of these fields should never be None, but due to broken data, they sometimes are anyway... # Most of these fields should never be None, but due to broken data, they sometimes are anyway...
class Submission(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class Submission(snscrape.base.Item):
author: typing.Optional[str] # E.g. submission hf7k6 author: typing.Optional[str] # E.g. submission hf7k6
created: datetime.datetime created: datetime.datetime
id: str id: str
@@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item):
return self.url return self.url
class Comment(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class Comment(snscrape.base.Item):
author: typing.Optional[str] author: typing.Optional[str]
body: str body: str
created: datetime.datetime created: datetime.datetime

View File

@@ -1,4 +1,5 @@
import bs4 import bs4
import dataclasses
import datetime import datetime
import logging import logging
import re import re
@@ -11,7 +12,8 @@ logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
class LinkPreview(typing.NamedTuple): @dataclasses.dataclass
class LinkPreview:
href: str href: str
siteName: typing.Optional[str] = None siteName: typing.Optional[str] = None
title: typing.Optional[str] = None title: typing.Optional[str] = None
@@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple):
image: typing.Optional[str] = None image: typing.Optional[str] = None
class TelegramPost(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
url: str url: str
date: datetime.datetime date: datetime.datetime
content: str content: str
@@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item):
return self.url return self.url
class Channel(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class Channel(snscrape.base.Entity):
username: str username: str
title: str title: str
verified: bool verified: bool

View File

@@ -1,4 +1,5 @@
import bs4 import bs4
import dataclasses
import datetime import datetime
import email.utils import email.utils
import itertools import itertools
@@ -17,7 +18,8 @@ logger = logging.getLogger(__name__)
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
class Tweet(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class Tweet(snscrape.base.Item):
url: str url: str
date: datetime.datetime date: datetime.datetime
content: str content: str
@@ -49,39 +51,45 @@ class Medium:
pass pass
class Photo(typing.NamedTuple, Medium): @dataclasses.dataclass
class Photo(Medium):
previewUrl: str previewUrl: str
fullUrl: str fullUrl: str
type: str = 'photo' type: str = 'photo'
class VideoVariant(typing.NamedTuple): @dataclasses.dataclass
class VideoVariant:
contentType: str contentType: str
url: str url: str
bitrate: typing.Optional[int] bitrate: typing.Optional[int]
class Video(typing.NamedTuple, Medium): @dataclasses.dataclass
class Video(Medium):
thumbnailUrl: str thumbnailUrl: str
variants: typing.List[VideoVariant] variants: typing.List[VideoVariant]
duration: float duration: float
type: str = 'video' type: str = 'video'
class Gif(typing.NamedTuple, Medium): @dataclasses.dataclass
class Gif(Medium):
thumbnailUrl: str thumbnailUrl: str
variants: typing.List[VideoVariant] variants: typing.List[VideoVariant]
type: str = 'gif' type: str = 'gif'
class DescriptionURL(typing.NamedTuple): @dataclasses.dataclass
class DescriptionURL:
text: str text: str
url: str url: str
tcourl: str tcourl: str
indices: typing.Tuple[int, int] indices: typing.Tuple[int, int]
class User(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class User(snscrape.base.Entity):
# Most fields can be None if they're not known. # Most fields can be None if they're not known.
username: str username: str

View File

@@ -1,5 +1,6 @@
import bs4 import bs4
import collections import collections
import dataclasses
import datetime import datetime
import itertools import itertools
import logging import logging
@@ -11,7 +12,8 @@ import urllib.parse
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class VKontaktePost(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
url: str url: str
date: datetime.datetime date: datetime.datetime
content: str content: str
@@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
return self.url return self.url
class User(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class User(snscrape.base.Entity):
username: str username: str
name: str name: str
verified: bool verified: bool

View File

@@ -1,4 +1,5 @@
import bs4 import bs4
import dataclasses
import datetime import datetime
import logging import logging
import snscrape.base import snscrape.base
@@ -10,7 +11,8 @@ logger = logging.getLogger(__name__)
_userDoesNotExist = object() _userDoesNotExist = object()
class Post(typing.NamedTuple, snscrape.base.Item): @dataclasses.dataclass
class Post(snscrape.base.Item):
url: str url: str
id: str id: str
user: typing.Optional['User'] user: typing.Optional['User']
@@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item):
return self.url return self.url
class User(typing.NamedTuple, snscrape.base.Entity): @dataclasses.dataclass
class User(snscrape.base.Entity):
screenname: str screenname: str
uid: int uid: int
verified: bool verified: bool