mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 11:58:28 +03:00
Replace named tuples with dataclasses and move JSON conversion logic to the base classes
Named tuples were never really adequate for this, since the ordering of their fields carries no meaning for these objects. Furthermore, named tuples don't support multiple inheritance, which meant that, for example, the objects returned by get_items() were not actually instances of Item. Since Python 3.9, named tuple classes that also list other base classes can no longer be created at all. Fixes #111
This commit is contained in:
@@ -1,5 +1,8 @@
|
|||||||
import abc
|
import abc
|
||||||
|
import dataclasses
|
||||||
|
import datetime
|
||||||
import functools
|
import functools
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
@@ -8,7 +11,28 @@ import time
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Item:
|
def _json_serialise_datetime(obj):
|
||||||
|
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
|
||||||
|
if isinstance(obj, (datetime.datetime, datetime.date)):
|
||||||
|
return obj.isoformat()
|
||||||
|
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class _JSONDataclass:
|
||||||
|
'''A base class for dataclasses for conversion to JSON'''
|
||||||
|
|
||||||
|
def json(self):
|
||||||
|
'''Convert the object to a JSON string'''
|
||||||
|
out = dataclasses.asdict(self)
|
||||||
|
for key, value in out.items():
|
||||||
|
if isinstance(value, _JSONDataclass):
|
||||||
|
out[key] = value.json()
|
||||||
|
return json.dumps(out, default = _json_serialise_datetime)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class Item(_JSONDataclass):
|
||||||
'''An abstract base class for an item returned by the scraper's get_items generator.
|
'''An abstract base class for an item returned by the scraper's get_items generator.
|
||||||
|
|
||||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
||||||
@@ -18,7 +42,8 @@ class Item:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Entity:
|
@dataclasses.dataclass
|
||||||
|
class Entity(_JSONDataclass):
|
||||||
'''An abstract base class for an entity returned by the scraper's entity property.
|
'''An abstract base class for an entity returned by the scraper's entity property.
|
||||||
|
|
||||||
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''
|
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.'''
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import argparse
|
|||||||
import contextlib
|
import contextlib
|
||||||
import datetime
|
import datetime
|
||||||
import inspect
|
import inspect
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import requests.models
|
import requests.models
|
||||||
# Imported in parse_args() after setting up the logger:
|
# Imported in parse_args() after setting up the logger:
|
||||||
@@ -226,26 +225,6 @@ def configure_logging(verbosity, dumpLocals_):
|
|||||||
rootLogger.addHandler(handler)
|
rootLogger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
def json_serialise_datetime(obj):
|
|
||||||
if isinstance(obj, (datetime.datetime, datetime.date)):
|
|
||||||
return obj.isoformat()
|
|
||||||
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
|
|
||||||
|
|
||||||
|
|
||||||
def namedtuple_to_dict_recursive(obj):
|
|
||||||
# Convert a NamedTuple to a dict; also converts NamedTuples in its values to dicts
|
|
||||||
if (isinstance(obj, tuple) and hasattr(obj, '_asdict')) or isinstance(obj, dict):
|
|
||||||
if isinstance(obj, tuple):
|
|
||||||
obj = obj._asdict()
|
|
||||||
for key, value in obj.items():
|
|
||||||
obj[key] = namedtuple_to_dict_recursive(value)
|
|
||||||
return obj
|
|
||||||
elif isinstance(obj, (tuple, list)):
|
|
||||||
return type(obj)(namedtuple_to_dict_recursive(value) for value in obj)
|
|
||||||
else:
|
|
||||||
return obj
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
setup_logging()
|
setup_logging()
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
@@ -256,7 +235,7 @@ def main():
|
|||||||
with _dump_locals_on_exception():
|
with _dump_locals_on_exception():
|
||||||
if args.withEntity and (entity := scraper.entity):
|
if args.withEntity and (entity := scraper.entity):
|
||||||
if args.jsonl:
|
if args.jsonl:
|
||||||
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
|
print(entity.json())
|
||||||
else:
|
else:
|
||||||
print(entity)
|
print(entity)
|
||||||
if args.maxResults == 0:
|
if args.maxResults == 0:
|
||||||
@@ -267,7 +246,7 @@ def main():
|
|||||||
logger.info(f'Exiting due to reaching older results than {args.since}')
|
logger.info(f'Exiting due to reaching older results than {args.since}')
|
||||||
break
|
break
|
||||||
if args.jsonl:
|
if args.jsonl:
|
||||||
print(json.dumps(namedtuple_to_dict_recursive(item), default = json_serialise_datetime))
|
print(item.json())
|
||||||
elif args.format is not None:
|
elif args.format is not None:
|
||||||
print(args.format.format(**item._asdict()))
|
print(args.format.format(**item._asdict()))
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import bs4
|
import bs4
|
||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -11,7 +12,8 @@ import urllib.parse
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class FacebookPost(snscrape.base.Item):
|
||||||
cleanUrl: str
|
cleanUrl: str
|
||||||
dirtyUrl: str
|
dirtyUrl: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
@@ -23,7 +25,8 @@ class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.cleanUrl
|
return self.cleanUrl
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
username: str
|
username: str
|
||||||
pageId: int
|
pageId: int
|
||||||
name: str
|
name: str
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
@@ -10,7 +11,8 @@ import typing
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class InstagramPost(snscrape.base.Item):
|
||||||
cleanUrl: str
|
cleanUrl: str
|
||||||
dirtyUrl: str
|
dirtyUrl: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
@@ -27,7 +29,8 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.cleanUrl
|
return self.cleanUrl
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
username: str
|
username: str
|
||||||
name: typing.Optional[str]
|
name: typing.Optional[str]
|
||||||
followers: int
|
followers: int
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@@ -13,7 +14,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
||||||
|
|
||||||
class Submission(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class Submission(snscrape.base.Item):
|
||||||
author: typing.Optional[str] # E.g. submission hf7k6
|
author: typing.Optional[str] # E.g. submission hf7k6
|
||||||
created: datetime.datetime
|
created: datetime.datetime
|
||||||
id: str
|
id: str
|
||||||
@@ -27,7 +29,8 @@ class Submission(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
|
|
||||||
class Comment(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class Comment(snscrape.base.Item):
|
||||||
author: typing.Optional[str]
|
author: typing.Optional[str]
|
||||||
body: str
|
body: str
|
||||||
created: datetime.datetime
|
created: datetime.datetime
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import bs4
|
import bs4
|
||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@@ -11,7 +12,8 @@ logger = logging.getLogger(__name__)
|
|||||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||||
|
|
||||||
|
|
||||||
class LinkPreview(typing.NamedTuple):
|
@dataclasses.dataclass
|
||||||
|
class LinkPreview:
|
||||||
href: str
|
href: str
|
||||||
siteName: typing.Optional[str] = None
|
siteName: typing.Optional[str] = None
|
||||||
title: typing.Optional[str] = None
|
title: typing.Optional[str] = None
|
||||||
@@ -19,7 +21,8 @@ class LinkPreview(typing.NamedTuple):
|
|||||||
image: typing.Optional[str] = None
|
image: typing.Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class TelegramPost(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class TelegramPost(snscrape.base.Item):
|
||||||
url: str
|
url: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
content: str
|
content: str
|
||||||
@@ -31,7 +34,8 @@ class TelegramPost(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
|
|
||||||
class Channel(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class Channel(snscrape.base.Entity):
|
||||||
username: str
|
username: str
|
||||||
title: str
|
title: str
|
||||||
verified: bool
|
verified: bool
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import bs4
|
import bs4
|
||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import email.utils
|
import email.utils
|
||||||
import itertools
|
import itertools
|
||||||
@@ -17,7 +18,8 @@ logger = logging.getLogger(__name__)
|
|||||||
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
_API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||||
|
|
||||||
|
|
||||||
class Tweet(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class Tweet(snscrape.base.Item):
|
||||||
url: str
|
url: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
content: str
|
content: str
|
||||||
@@ -49,39 +51,45 @@ class Medium:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Photo(typing.NamedTuple, Medium):
|
@dataclasses.dataclass
|
||||||
|
class Photo(Medium):
|
||||||
previewUrl: str
|
previewUrl: str
|
||||||
fullUrl: str
|
fullUrl: str
|
||||||
type: str = 'photo'
|
type: str = 'photo'
|
||||||
|
|
||||||
|
|
||||||
class VideoVariant(typing.NamedTuple):
|
@dataclasses.dataclass
|
||||||
|
class VideoVariant:
|
||||||
contentType: str
|
contentType: str
|
||||||
url: str
|
url: str
|
||||||
bitrate: typing.Optional[int]
|
bitrate: typing.Optional[int]
|
||||||
|
|
||||||
|
|
||||||
class Video(typing.NamedTuple, Medium):
|
@dataclasses.dataclass
|
||||||
|
class Video(Medium):
|
||||||
thumbnailUrl: str
|
thumbnailUrl: str
|
||||||
variants: typing.List[VideoVariant]
|
variants: typing.List[VideoVariant]
|
||||||
duration: float
|
duration: float
|
||||||
type: str = 'video'
|
type: str = 'video'
|
||||||
|
|
||||||
|
|
||||||
class Gif(typing.NamedTuple, Medium):
|
@dataclasses.dataclass
|
||||||
|
class Gif(Medium):
|
||||||
thumbnailUrl: str
|
thumbnailUrl: str
|
||||||
variants: typing.List[VideoVariant]
|
variants: typing.List[VideoVariant]
|
||||||
type: str = 'gif'
|
type: str = 'gif'
|
||||||
|
|
||||||
|
|
||||||
class DescriptionURL(typing.NamedTuple):
|
@dataclasses.dataclass
|
||||||
|
class DescriptionURL:
|
||||||
text: str
|
text: str
|
||||||
url: str
|
url: str
|
||||||
tcourl: str
|
tcourl: str
|
||||||
indices: typing.Tuple[int, int]
|
indices: typing.Tuple[int, int]
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
# Most fields can be None if they're not known.
|
# Most fields can be None if they're not known.
|
||||||
|
|
||||||
username: str
|
username: str
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import bs4
|
import bs4
|
||||||
import collections
|
import collections
|
||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
@@ -11,7 +12,8 @@ import urllib.parse
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class VKontaktePost(snscrape.base.Item):
|
||||||
url: str
|
url: str
|
||||||
date: datetime.datetime
|
date: datetime.datetime
|
||||||
content: str
|
content: str
|
||||||
@@ -20,7 +22,8 @@ class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
username: str
|
username: str
|
||||||
name: str
|
name: str
|
||||||
verified: bool
|
verified: bool
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import bs4
|
import bs4
|
||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import snscrape.base
|
import snscrape.base
|
||||||
@@ -10,7 +11,8 @@ logger = logging.getLogger(__name__)
|
|||||||
_userDoesNotExist = object()
|
_userDoesNotExist = object()
|
||||||
|
|
||||||
|
|
||||||
class Post(typing.NamedTuple, snscrape.base.Item):
|
@dataclasses.dataclass
|
||||||
|
class Post(snscrape.base.Item):
|
||||||
url: str
|
url: str
|
||||||
id: str
|
id: str
|
||||||
user: typing.Optional['User']
|
user: typing.Optional['User']
|
||||||
@@ -29,7 +31,8 @@ class Post(typing.NamedTuple, snscrape.base.Item):
|
|||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
|
|
||||||
class User(typing.NamedTuple, snscrape.base.Entity):
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
screenname: str
|
screenname: str
|
||||||
uid: int
|
uid: int
|
||||||
verified: bool
|
verified: bool
|
||||||
|
|||||||
Reference in New Issue
Block a user