diff --git a/snscrape/base.py b/snscrape/base.py
index 714e23e..37ab12d 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -1,5 +1,8 @@
 import abc
+import dataclasses
+import datetime
 import functools
+import json
 import logging
 import requests
 import time
@@ -8,7 +11,26 @@ import time
 logger = logging.getLogger(__name__)
 
 
-class Item:
+def _json_serialise_datetime(obj):
+	'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
+	if isinstance(obj, (datetime.datetime, datetime.date)):
+		return obj.isoformat()
+	raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
+
+
+@dataclasses.dataclass
+class _JSONDataclass:
+	'''A base class for dataclasses for conversion to JSON'''
+
+	def json(self):
+		'''Convert the object to a JSON string
+
+		dataclasses.asdict already recurses into nested dataclasses, lists, tuples, and dicts, so no manual traversal of the values is needed. datetime.datetime and datetime.date values are emitted as ISO-8601 strings; any other non-serialisable value raises TypeError.'''
+		return json.dumps(dataclasses.asdict(self), default = _json_serialise_datetime)
+
+
+@dataclasses.dataclass
+class Item(_JSONDataclass):
 	'''An abstract base class for an item returned by the scraper's get_items generator.
 
 	An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
@@ -18,7 +40,8 @@ class Item:
 		pass
 
 
-class Entity:
+@dataclasses.dataclass
+class Entity(_JSONDataclass):
 	'''An abstract base class for an entity returned by the scraper's entity property.
 
 	An entity is typically the account of a person or organisation. 
The string representation should be the preferred direct URL to the entity's page on the network.'''
diff --git a/snscrape/cli.py b/snscrape/cli.py
index c279aa7..2ccd9c0 100644
--- a/snscrape/cli.py
+++ b/snscrape/cli.py
@@ -2,7 +2,6 @@ import argparse
 import contextlib
 import datetime
 import inspect
-import json
 import logging
 import requests.models
 # Imported in parse_args() after setting up the logger:
@@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_):
 	rootLogger.addHandler(handler)
 
 
-def json_serialise_datetime(obj):
-	if isinstance(obj, (datetime.datetime, datetime.date)):
-		return obj.isoformat()
-	raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
-
-
-def namedtuple_to_dict_recursive(obj):
-	# Convert a NamedTuple to a dict; also converts NamedTuples in its values to dicts
-	if (isinstance(obj, tuple) and hasattr(obj, '_asdict')) or isinstance(obj, dict):
-		if isinstance(obj, tuple):
-			obj = obj._asdict()
-		for key, value in obj.items():
-			obj[key] = namedtuple_to_dict_recursive(value)
-		return obj
-	elif isinstance(obj, (tuple, list)):
-		return type(obj)(namedtuple_to_dict_recursive(value) for value in obj)
-	else:
-		return obj
-
-
 def main():
 	setup_logging()
 	args = parse_args()
@@ -256,7 +235,7 @@ def main():
 		with _dump_locals_on_exception():
 			if args.withEntity and (entity := scraper.entity):
 				if args.jsonl:
-					print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
+					print(entity.json())
 				else:
 					print(entity)
 			if args.maxResults == 0:
@@ -267,7 +246,8 @@ def main():
 					logger.info(f'Exiting due to reaching older results than {args.since}')
 					break
 				if args.jsonl:
-					print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime))
+					print(item.json())
 				elif args.format is not None:
-					print(args.format.format(**item._asdict()))
+					# Items are dataclasses now and have no _asdict(); vars() gives a shallow mapping of the instance's fields, like NamedTuple._asdict() did.
+					print(args.format.format(**vars(item)))
 				else:
diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py
index 1bae599..0e3e58b 100644
--- a/snscrape/modules/facebook.py
+++ 
b/snscrape/modules/facebook.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import json import logging @@ -11,7 +12,8 @@ import urllib.parse logger = logging.getLogger(__name__) -class FacebookPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class FacebookPost(snscrape.base.Item): cleanUrl: str dirtyUrl: str date: datetime.datetime @@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item): return self.cleanUrl -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str pageId: int name: str diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py index 357c4e4..8afc15c 100644 --- a/snscrape/modules/instagram.py +++ b/snscrape/modules/instagram.py @@ -1,3 +1,4 @@ +import dataclasses import datetime import hashlib import json @@ -10,7 +11,8 @@ import typing logger = logging.getLogger(__name__) -class InstagramPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class InstagramPost(snscrape.base.Item): cleanUrl: str dirtyUrl: str date: datetime.datetime @@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item): return self.cleanUrl -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str name: typing.Optional[str] followers: int diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py index 4f6d322..892cbbf 100644 --- a/snscrape/modules/reddit.py +++ b/snscrape/modules/reddit.py @@ -1,3 +1,4 @@ +import dataclasses import datetime import logging import re @@ -13,7 +14,8 @@ logger = logging.getLogger(__name__) # Most of these fields should never be None, but due to broken data, they sometimes are anyway... -class Submission(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Submission(snscrape.base.Item): author: typing.Optional[str] # E.g. 
submission hf7k6 created: datetime.datetime id: str @@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item): return self.url -class Comment(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Comment(snscrape.base.Item): author: typing.Optional[str] body: str created: datetime.datetime diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 5e83917..e566f70 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import logging import re @@ -11,7 +12,8 @@ logger = logging.getLogger(__name__) _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') -class LinkPreview(typing.NamedTuple): +@dataclasses.dataclass +class LinkPreview: href: str siteName: typing.Optional[str] = None title: typing.Optional[str] = None @@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple): image: typing.Optional[str] = None -class TelegramPost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class TelegramPost(snscrape.base.Item): url: str date: datetime.datetime content: str @@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item): return self.url -class Channel(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class Channel(snscrape.base.Entity): username: str title: str verified: bool diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index f5f7bae..9b8c078 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses import datetime import email.utils import itertools @@ -17,7 +18,8 @@ logger = logging.getLogger(__name__) _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' -class Tweet(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Tweet(snscrape.base.Item): url: str date: 
datetime.datetime content: str @@ -49,39 +51,45 @@ class Medium: pass -class Photo(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Photo(Medium): previewUrl: str fullUrl: str type: str = 'photo' -class VideoVariant(typing.NamedTuple): +@dataclasses.dataclass +class VideoVariant: contentType: str url: str bitrate: typing.Optional[int] -class Video(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Video(Medium): thumbnailUrl: str variants: typing.List[VideoVariant] duration: float type: str = 'video' -class Gif(typing.NamedTuple, Medium): +@dataclasses.dataclass +class Gif(Medium): thumbnailUrl: str variants: typing.List[VideoVariant] type: str = 'gif' -class DescriptionURL(typing.NamedTuple): +@dataclasses.dataclass +class DescriptionURL: text: str url: str tcourl: str indices: typing.Tuple[int, int] -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): # Most fields can be None if they're not known. username: str diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 2684be1..89fa63c 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -1,5 +1,6 @@ import bs4 import collections +import dataclasses import datetime import itertools import logging @@ -11,7 +12,8 @@ import urllib.parse logger = logging.getLogger(__name__) -class VKontaktePost(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class VKontaktePost(snscrape.base.Item): url: str date: datetime.datetime content: str @@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item): return self.url -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): username: str name: str verified: bool diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index c5106b2..69f7256 100644 --- a/snscrape/modules/weibo.py +++ b/snscrape/modules/weibo.py @@ -1,4 +1,5 @@ import bs4 +import dataclasses 
import datetime import logging import snscrape.base @@ -10,7 +11,8 @@ logger = logging.getLogger(__name__) _userDoesNotExist = object() -class Post(typing.NamedTuple, snscrape.base.Item): +@dataclasses.dataclass +class Post(snscrape.base.Item): url: str id: str user: typing.Optional['User'] @@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item): return self.url -class User(typing.NamedTuple, snscrape.base.Entity): +@dataclasses.dataclass +class User(snscrape.base.Entity): screenname: str uid: int verified: bool