From bd53e729a09eed73a6778a565229111be0578aed Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 15 Oct 2020 23:41:30 +0000 Subject: [PATCH] Replace named tuples with dataclasses and move JSON conversion logic to the base classes Named tuples were never really adequate for this since the order aspect of them doesn't make sense. Further, named tuples don't support multiple inheritance. This meant that the objects returned by get_items() were not actually Items, for example. Since Python 3.9, such named tuples cannot be created anymore. Fixes #111 --- snscrape/base.py | 29 +++++++++++++++++++++++++++-- snscrape/cli.py | 25 ++----------------------- snscrape/modules/facebook.py | 7 +++++-- snscrape/modules/instagram.py | 7 +++++-- snscrape/modules/reddit.py | 7 +++++-- snscrape/modules/telegram.py | 10 +++++++--- snscrape/modules/twitter.py | 22 +++++++++++++++------- snscrape/modules/vkontakte.py | 7 +++++-- snscrape/modules/weibo.py | 7 +++++-- 9 files changed, 76 insertions(+), 45 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index 714e23e..37ab12d 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -1,5 +1,8 @@ import abc +import dataclasses +import datetime import functools +import json import logging import requests import time @@ -8,7 +11,28 @@ import time logger = logging.getLogger(__name__) -class Item: +def _json_serialise_datetime(obj): + '''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.''' + if isinstance(obj, (datetime.datetime, datetime.date)): + return obj.isoformat() + raise TypeError(f'Object of type {type(obj)} is not JSON serializable') + + +@dataclasses.dataclass +class _JSONDataclass: + '''A base class for dataclasses for conversion to JSON''' + + def json(self): + '''Convert the object to a JSON string''' + out = dataclasses.asdict(self) + for key, value in out.items(): + if isinstance(value, _JSONDataclass): + out[key] = value.json() + return json.dumps(out, default = _json_serialise_datetime) + + +@dataclasses.dataclass +class Item(_JSONDataclass): '''An abstract base class for an item returned by the scraper's get_items generator. An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).''' @@ -18,7 +42,8 @@ class Item: pass -class Entity: +@dataclasses.dataclass +class Entity(_JSONDataclass): '''An abstract base class for an entity returned by the scraper's entity property. An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.''' diff --git a/snscrape/cli.py b/snscrape/cli.py index c279aa7..2ccd9c0 100644 --- a/snscrape/cli.py +++ b/snscrape/cli.py @@ -2,7 +2,6 @@ import argparse import contextlib import datetime import inspect -import json import logging import requests.models # Imported in parse_args() after setting up the logger: @@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_): rootLogger.addHandler(handler) -def json_serialise_datetime(obj): - if isinstance(obj, (datetime.datetime, datetime.date)): - return obj.isoformat() - raise TypeError(f'Object of type {type(obj)} is not JSON serializable') - - -def namedtuple_to_dict_recursive(obj): - # Convert a NamedTuple to a dict; also converts NamedTuples in its values to dicts - if (isinstance(obj, tuple) and hasattr(obj, '_asdict')) or isinstance(obj, dict): - if isinstance(obj, tuple): - obj = obj._asdict() - for key, value in obj.items(): - obj[key] = namedtuple_to_dict_recursive(value) - return obj - elif isinstance(obj, (tuple, list)): - return type(obj)(namedtuple_to_dict_recursive(value) for value in obj) - else: - return obj - - def main(): setup_logging() args = parse_args() @@ -256,7 +235,7 @@ def main(): with _dump_locals_on_exception(): if args.withEntity and (entity := scraper.entity): if args.jsonl: - print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime)) + print(entity.json()) else: print(entity) if args.maxResults == 0: @@ -267,7 +246,7 @@ def main(): logger.info(f'Exiting due to reaching older results than {args.since}') break if args.jsonl: - print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime)) + print(item.json()) elif args.format is not None: print(args.format.format(**item._asdict())) else: diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 1bae599..0e3e58b 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import json import logging @@ -11,7 +12,8 @@ import urllib.parse logger = logging.getLogger(__name__) -class FacebookPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class FacebookPost(snscrape.base.Item): cleanUrl: str dirtyUrl: str date: datetime.datetime @@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item): return self.cleanUrl -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str pageId: int name: str diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py index 357c4e4..8afc15c 100644 --- a/snscrape/modules/instagram.py +++ b/snscrape/modules/instagram.py @@ -1,3 +1,4 @@ +import dataclasses import datetime import hashlib import json @@ -10,7 +11,8 @@ import typing logger = logging.getLogger(__name__) -class InstagramPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class InstagramPost(snscrape.base.Item): cleanUrl: str dirtyUrl: str date: datetime.datetime @@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item): return self.cleanUrl -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str name: typing.Optional[str] followers: int diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py index 4f6d322..892cbbf 100644 --- a/snscrape/modules/reddit.py +++ b/snscrape/modules/reddit.py @@ -1,3 +1,4 @@ +import dataclasses import datetime import logging import re @@ -13,7 +14,8 @@ logger = logging.getLogger(__name__) # Most of these fields should never be None, but due to broken data, they sometimes are anyway... -class Submission(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Submission(snscrape.base.Item): author: typing.Optional[str] # E.g. submission hf7k6 created: datetime.datetime id: str @@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item): return self.url -class Comment(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Comment(snscrape.base.Item): author: typing.Optional[str] body: str created: datetime.datetime diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 5e83917..e566f70 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import logging import re @@ -11,7 +12,8 @@ logger = logging.getLogger(__name__) _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') -class LinkPreview(typing.NamedTuple): +@dataclasses.dataclass +class LinkPreview: href: str siteName: typing.Optional[str] = None title: typing.Optional[str] = None @@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple): image: typing.Optional[str] = None -class TelegramPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class TelegramPost(snscrape.base.Item): url: str date: datetime.datetime content: str @@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item): return self.url -class Channel(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class Channel(snscrape.base.Entity): username: str title: str verified: bool diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index f5f7bae..9b8c078 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import email.utils import itertools @@ -17,7 +18,8 @@ logger = logging.getLogger(__name__) _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' -class Tweet(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Tweet(snscrape.base.Item): url: str date: datetime.datetime content: str @@ -49,39 +51,45 @@ class Medium: pass -class Photo(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Photo(Medium): previewUrl: str fullUrl: str type: str = 'photo' -class VideoVariant(typing.NamedTuple): +@dataclasses.dataclass +class VideoVariant: contentType: str url: str bitrate: typing.Optional[int] -class Video(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Video(Medium): thumbnailUrl: str variants: typing.List[VideoVariant] duration: float type: str = 'video' -class Gif(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Gif(Medium): thumbnailUrl: str variants: typing.List[VideoVariant] type: str = 'gif' -class DescriptionURL(typing.NamedTuple): +@dataclasses.dataclass +class DescriptionURL: text: str url: str tcourl: str indices: typing.Tuple[int, int] -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): # Most fields can be None if they're not known. username: str diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 2684be1..89fa63c 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -1,5 +1,6 @@ import bs4 import collections +import dataclasses import datetime import itertools import logging @@ -11,7 +12,8 @@ import urllib.parse logger = logging.getLogger(__name__) -class VKontaktePost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class VKontaktePost(snscrape.base.Item): url: str date: datetime.datetime content: str @@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item): return self.url -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str name: str verified: bool diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index c5106b2..69f7256 100644 --- a/snscrape/modules/weibo.py +++ b/snscrape/modules/weibo.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import logging import snscrape.base @@ -10,7 +11,8 @@ logger = logging.getLogger(__name__) _userDoesNotExist = object() -class Post(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Post(snscrape.base.Item): url: str id: str user: typing.Optional['User'] @@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item): return self.url -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): screenname: str uid: int verified: bool