added User dataclass as argument to VKontaktePost dataclass

Merge pull request #4 from JustAnotherArchivist/master
upstream merge
2026-06-08 18:48:28 +03:00 · 2022-07-05 10:21:59 -05:00 · 2022-05-24 23:10:38 -07:00 · 2022-05-23 23:06:16 +00:00 · 2022-05-23 23:31:44 +01:00 · 2022-05-09 09:37:36 -05:00
17 changed files with 3745 additions and 696 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+/dist/
+/snscrape.egg-info/
+/.eggs/
--- a/README.md
+++ b/README.md
@@ -1,16 +1,19 @@
 # snscrape
-snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.                  
+snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.

 The following services are currently supported:
-* Facebook: user profiles and groups
-* Gab: user profile posts, media, and comments
-* Google+: user profiles
+
+* Facebook: user profiles, groups, and communities (aka visitor posts)
 * Instagram: user profiles, hashtags, and locations
-* Twitter: user profiles, hashtags, searches, threads, and lists (members as well as posts)
+* Mastodon: user profiles and toots (single or thread)
+* Reddit: users, subreddits, and searches (via Pushshift)
+* Telegram: channels
+* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
 * VKontakte: user profiles
+* Weibo (Sina Weibo): user profiles

 ## Requirements
-snscrape requires Python 3.6 or higher. The Python package dependencies are installed automatically when you install snscrape.
+snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.

 Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.

@@ -22,11 +25,28 @@ If you want to use the development version:
    pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git

 ## Usage
-To get all tweets by Jason Scott (@textfiles):
+### CLI
+The generic syntax of snscrape's CLI is:
+
+    snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
+
+`snscrape --help` and `snscrape SCRAPER-NAME --help` provide details on the options and arguments. `snscrape --help` also lists all available scrapers.
+
+The default output of the CLI is the URL of each result.
+
+Some noteworthy global options are:
+
+* `--jsonl` to get output as JSONL. This includes all information extracted by snscrape (e.g. message content, datetime, images; details vary by scraper).
+* `--max-results NUMBER` to only return the first `NUMBER` results.
+* `--with-entity` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with `--max-results 0` to only fetch the entity info.)
+
+#### Examples
+Collect all tweets by Jason Scott (@textfiles):

    snscrape twitter-user textfiles

-It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `@textfiles-tweets`:
+It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `twitter-@textfiles`:
+
 ```bash
 snscrape twitter-user textfiles >twitter-@textfiles
 ```
@@ -35,8 +55,7 @@ To get the latest 100 tweets with the hashtag #archiveteam:

    snscrape --max-results 100 twitter-hashtag archiveteam

-`snscrape --help` or `snscrape <module> --help` provides details on the available options. `snscrape --help` also lists all available modules.
-
+### Library
 It is also possible to use snscrape as a library in Python, but this is currently undocumented.

 ## Issue reporting
--- a/setup.py
+++ b/setup.py
@@ -1,23 +1,42 @@
+import os.path
 import setuptools


+with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp:
+	readme = fp.read()
+
+
 setuptools.setup(
 	name = 'snscrape',
 	description = 'A social networking service scraper',
+	long_description = readme,
+	long_description_content_type = 'text/markdown',
 	author = 'JustAnotherArchivist',
 	url = 'https://github.com/JustAnotherArchivist/snscrape',
 	classifiers = [
 		'Development Status :: 4 - Beta',
 		'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
-		'Programming Language :: Python :: 3.6',
+		'Programming Language :: Python :: 3.8',
+		'Programming Language :: Python :: 3.9',
+		'Programming Language :: Python :: 3.10',
 	],
 	packages = ['snscrape', 'snscrape.modules'],
 	setup_requires = ['setuptools_scm'],
 	use_scm_version = True,
-	install_requires = ['requests[socks]', 'lxml', 'beautifulsoup4'],
+	install_requires = [
+		'requests[socks]',
+		'lxml',
+		'beautifulsoup4',
+		'pytz; python_version < "3.9.0"',
+		'filelock',
+	],
+	python_requires = '~=3.8',
+	extras_require = {
+		'test': ['coverage'],
+	},
 	entry_points = {
 		'console_scripts': [
-			'snscrape = snscrape.cli:main',
+			'snscrape = snscrape._cli:main',
 		],
 	},
 )
--- a/snscrape/_cli.py
+++ b/snscrape/_cli.py
@@ -1,13 +1,17 @@
 import argparse
+import collections
 import contextlib
+import dataclasses
 import datetime
+import importlib.metadata
 import inspect
 import logging
-import requests.models
+import requests
 # Imported in parse_args() after setting up the logger:
 #import snscrape.base
 #import snscrape.modules
 #import snscrape.version
+import sys
 import tempfile


@@ -41,30 +45,31 @@ class Logger(logging.Logger):
 			super().log(level, *args, **kwargs)


-def _requests_preparedrequest_repr(name, request):
+def _requests_request_repr(name, request):
 	ret = []
-	ret.append(repr(request))
+	ret.append(f'{name} = {request!r}')
 	ret.append(f'\n  {name}.method = {request.method}')
 	ret.append(f'\n  {name}.url = {request.url}')
 	ret.append(f'\n  {name}.headers = \\')
 	for field in request.headers:
 		ret.append(f'\n    {field} = {_repr("_", request.headers[field])}')
-	if request.body:
-		ret.append(f'\n  {name}.body = ')
-		ret.append(_repr('_', request.body).replace('\n', '\n  '))
+	for attr in ('body', 'params', 'data'):
+		if hasattr(request, attr) and getattr(request, attr):
+			ret.append(f'\n  {name}.{attr} = ')
+			ret.append(_repr('_', getattr(request, attr)).replace('\n', '\n  '))
 	return ''.join(ret)


 def _requests_response_repr(name, response, withHistory = True):
 	ret = []
-	ret.append(repr(response))
+	ret.append(f'{name} = {response!r}')
 	ret.append(f'\n  {name}.url = {response.url}')
 	ret.append(f'\n  {name}.request = ')
 	ret.append(_repr('_', response.request).replace('\n', '\n  '))
 	if withHistory and response.history:
 		ret.append(f'\n  {name}.history = [')
 		for previousResponse in response.history:
-			ret.append(f'\n    ')
+			ret.append('\n    ')
 			ret.append(_requests_response_repr('_', previousResponse, withHistory = False).replace('\n', '\n    '))
 		ret.append('\n  ]')
 	ret.append(f'\n  {name}.status_code = {response.status_code}')
@@ -75,12 +80,31 @@ def _requests_response_repr(name, response, withHistory = True):
 	return ''.join(ret)


+def _requests_exception_repr(name, exc):
+	ret = []
+	ret.append(f'{name} = {exc!r}')
+	ret.append('\n  ' + _repr(f'{name}.request', exc.request).replace('\n', '\n  '))
+	ret.append('\n  ' + _repr(f'{name}.response', exc.response).replace('\n', '\n  '))
+	return ''.join(ret)
+
+
 def _repr(name, value):
-	if type(value) is requests.models.Response:
+	if type(value) is requests.Response:
 		return _requests_response_repr(name, value)
-	if type(value) is requests.models.PreparedRequest:
-		return _requests_preparedrequest_repr(name, value)
-	valueRepr = repr(value)
+	if type(value) in (requests.PreparedRequest, requests.Request):
+		return _requests_request_repr(name, value)
+	if isinstance(value, requests.exceptions.RequestException):
+		return _requests_exception_repr(name, value)
+	if isinstance(value, dict):
+		return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n  ' + \
+		       '\n  '.join(_repr(f'{name}[{k!r}]', v).replace('\n', '\n  ') for k, v in value.items())
+	if isinstance(value, (list, tuple, collections.deque)) and not all(isinstance(v, (int, str)) for v in value):
+		return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n  ' + \
+		       '\n  '.join(_repr(f'{name}[{i}]', v).replace('\n', '\n  ') for i, v in enumerate(value))
+	if dataclasses.is_dataclass(value) and not isinstance(value, type): # Dataclass instance (not the class itself): one 'name.field = value' entry per field
+		return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n  ' + \
+		       '\n  '.join(_repr(f'{name}.{f.name}', getattr(value, f.name)).replace('\n', '\n  ') for f in dataclasses.fields(value))
+	valueRepr = f'{name} = {value!r}'
 	if '\n' in valueRepr:
 		return ''.join(['\\\n  ', valueRepr.replace('\n', '\n  ')])
 	return valueRepr
@@ -93,22 +117,38 @@ def _dump_locals_on_exception():
 	except Exception as e:
 		trace = inspect.trace()
 		if len(trace) >= 2:
-			name = _dump_stack_and_locals(trace[1:])
+			name = _dump_stack_and_locals(trace[1:], exc = e)
 			logger.fatal(f'Dumped stack and locals to {name}')
 		raise


-def _dump_stack_and_locals(trace):
+def _dump_stack_and_locals(trace, exc = None):
 	with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
+		if exc is not None:
+			fp.write('Exception:\n')
+			fp.write(f'  {type(exc).__module__}.{type(exc).__name__}: {exc!s}\n')
+			fp.write(f'  args: {exc.args!r}\n')
+			fp.write('\n')
+
 		fp.write('Stack:\n')
 		for frameRecord in trace:
 			fp.write(f'  File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
-			for line in frameRecord.code_context:
-				fp.write(f'    {line.strip()}\n')
+			if frameRecord.code_context is not None:
+				for line in frameRecord.code_context:
+					fp.write(f'    {line.strip()}\n')
 		fp.write('\n')

-		for frameRecord in trace:
-			module = inspect.getmodule(frameRecord[0])
+		modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
+		for i, (module, frameRecord) in enumerate(zip(modules, trace)):
+			if module is None:
+				# Module-less frame, e.g. dataclass.__init__
+				for j in reversed(range(i)):
+					if modules[j] is not None:
+						break
+				else:
+					# No previous module scope
+					continue
+				module = modules[j]
 			if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
 				continue
 			locals_ = frameRecord[0].f_locals
@@ -121,7 +161,7 @@ def _dump_stack_and_locals(trace):
 				fp.write('\n')
 			fp.write('\n')
 			if 'self' in locals_ and hasattr(locals_['self'], '__dict__'):
-				fp.write(f'Object dict:\n')
+				fp.write('Object dict:\n')
 				fp.write(repr(locals_['self'].__dict__))
 				fp.write('\n\n')
 		name = fp.name
@@ -148,6 +188,45 @@ def parse_datetime_arg(arg):
 	raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object')


+def parse_format(arg):
+	# Replace '{' by '{0.' to use properties of the item, but keep '{{' intact; raises argparse.ArgumentTypeError on a lone trailing '{'
+	parts = arg.split('{')
+	out = ''
+	it = iter(zip(parts, parts[1:]))
+	for part, nextPart in it:
+		out += part
+		if nextPart != '': # Single brace
+			out += '{0.'
+		elif next(it, None) is not None: # Double brace; the pair starting at the empty part is consumed here
+			out += '{{'
+		else: # Lone '{' at the very end: report it cleanly instead of leaking StopIteration out of the argparse type callable
+			raise argparse.ArgumentTypeError(f'Unmatched {{ at the end of format {arg!r}')
+	return out + parts[-1]
+
+
+class CitationAction(argparse.Action):
+	def __init__(self, option_strings, dest = argparse.SUPPRESS, *args, default = argparse.SUPPRESS, **kwargs):
+		super().__init__(option_strings, dest, *args, **kwargs)
+
+	def __call__(self, parser, namespace, values, optionString):
+		try:
+			m = importlib.metadata.metadata('snscrape')
+		except importlib.metadata.PackageNotFoundError:
+			print('Error: could not find snscrape installation. --citation does not work without the package being installed.', file = sys.stderr)
+			parser.exit(1)
+		print(f'Author: {m["author"]}')
+		print(f'Title: {m["name"]}: {m["summary"]}')
+		print(f'URL: {m["home-page"]}')
+		print(f'Version: {m["version"]}')
+		print(f'Date: 2018‒{m["version"].split(".", 3)[3][:4]}')
+
+		if '.dev' in m['version']:
+			print()
+			print('WARNING! You are running a development version. The date range may be incorrect. Please adjust the upper end of the range to the year of the commit.')
+
+		parser.exit()
+
+
 def parse_args():
 	import snscrape.base
 	import snscrape.modules
@@ -155,28 +234,35 @@ def parse_args():

 	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
 	parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
+	parser.add_argument('--citation', action = CitationAction, nargs = 0, help = 'Display recommended citation information and exit')
 	parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
 	parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
 	parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
 		help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
-	parser.add_argument('-n', '--max-results', dest = 'maxResults', type = int, metavar = 'N', help = 'Only return the first N results')
-	parser.add_argument('-f', '--format', dest = 'format', type = str, default = None, help = 'Output format')
+	parser.add_argument('-n', '--max-results', dest = 'maxResults', type = lambda x: int(x) if int(x) >= 0 else parser.error('--max-results N must be zero or positive'), metavar = 'N', help = 'Only return the first N results')
+	group = parser.add_mutually_exclusive_group(required = False)
+	group.add_argument('-f', '--format', dest = 'format', type = parse_format, default = None, help = 'Output format')
+	group.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
+	parser.add_argument('--with-entity', dest = 'withEntity', action = 'store_true', default = False, help = 'Include the entity (e.g. user, channel) as the first output item')
 	parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
+	parser.add_argument('--progress', action = 'store_true', default = False, help = 'Report progress on stderr')

-	subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
+	subparsers = parser.add_subparsers(dest = 'scraper', metavar = 'SCRAPER', title = 'scrapers', required = True)
 	classes = snscrape.base.Scraper.__subclasses__()
+	scrapers = {}
 	for cls in classes:
 		if cls.name is not None:
-			subparser = subparsers.add_parser(cls.name, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
-			cls.setup_parser(subparser)
-			subparser.set_defaults(cls = cls)
+			scrapers[cls.name] = cls
 		classes.extend(cls.__subclasses__())
+	for scraper, cls in sorted(scrapers.items()):
+		subparser = subparsers.add_parser(cls.name, help = '', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+		cls._cli_setup_parser(subparser)
+		subparser.set_defaults(cls = cls)

 	args = parser.parse_args()

-	# http://bugs.python.org/issue16308 / https://bugs.python.org/issue26510 (fixed in Python 3.7)
-	if not args.scraper:
-		raise RuntimeError('Error: no scraper specified')
+	if not args.withEntity and args.maxResults == 0:
+		parser.error('--max-results 0 is only valid when used with --with-entity')

 	return args

@@ -217,20 +303,36 @@ def main():
 	setup_logging()
 	args = parse_args()
 	configure_logging(args.verbosity, args.dumpLocals)
-	scraper = args.cls.from_args(args)
+	scraper = args.cls._cli_from_args(args)

 	i = 0
 	with _dump_locals_on_exception():
+		if args.withEntity and (entity := scraper.entity):
+			if args.jsonl:
+				print(entity.json())
+			else:
+				print(entity)
+		if args.maxResults == 0:
+			logger.info('Exiting after 0 results')
+			return
 		for i, item in enumerate(scraper.get_items(), start = 1):
 			if args.since is not None and item.date < args.since:
 				logger.info(f'Exiting due to reaching older results than {args.since}')
 				break
-			if args.format is not None:
-				print(args.format.format(**item._asdict()))
+			if args.jsonl:
+				print(item.json())
+			elif args.format is not None:
+				print(args.format.format(item))
 			else:
 				print(item)
+			if args.progress and i % 100 == 0:
+				print(f'Scraping, {i} results so far', file = sys.stderr)
 			if args.maxResults and i >= args.maxResults:
 				logger.info(f'Exiting after {i} results')
+				if args.progress:
+					print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
 				break
 		else:
 			logger.info(f'Done, found {i} results')
+			if args.progress:
+				print(f'Finished, {i} results', file = sys.stderr)
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -1,22 +1,121 @@
 import abc
+import copy
+import dataclasses
+import datetime
+import functools
+import json
 import logging
 import requests
 import time
+import warnings


 logger = logging.getLogger(__name__)


-class Item:
+class _DeprecatedProperty:
+	def __init__(self, name, repl, replStr):
+		self.name = name
+		self.repl = repl
+		self.replStr = replStr
+
+	def __get__(self, obj, objType):
+		if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
+			return self
+		warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2)
+		return self.repl(obj)
+
+
+def _json_serialise_datetime(obj):
+	'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
+
+	if isinstance(obj, (datetime.datetime, datetime.date)):
+		return obj.isoformat()
+	raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
+
+
+def _json_dataclass_to_dict(obj):
+	if isinstance(obj, _JSONDataclass) or dataclasses.is_dataclass(obj):
+		out = {}
+		out['_type'] = f'{type(obj).__module__}.{type(obj).__name__}'
+		for field in dataclasses.fields(obj):
+			assert field.name != '_type'
+			if field.name.startswith('_'):
+				continue
+			out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
+		# Add in (non-deprecated) properties
+		for k in dir(obj):
+			if isinstance(getattr(type(obj), k, None), property):
+				assert k != '_type'
+				if k.startswith('_'):
+					continue
+				out[k] = _json_dataclass_to_dict(getattr(obj, k))
+		return out
+	elif isinstance(obj, (tuple, list)):
+		return type(obj)(_json_dataclass_to_dict(x) for x in obj)
+	elif isinstance(obj, dict):
+		return {_json_dataclass_to_dict(k): _json_dataclass_to_dict(v) for k, v in obj.items()}
+	elif isinstance(obj, set):
+		return {_json_dataclass_to_dict(v) for v in obj}
+	else:
+		return copy.deepcopy(obj)
+
+
+@dataclasses.dataclass
+class _JSONDataclass:
+	'''A base class for dataclasses for conversion to JSON'''
+
+	def json(self):
+		'''Convert the object to a JSON string'''
+
+		out = _json_dataclass_to_dict(self)
+		for key, value in list(out.items()): # Modifying the dict below, so make a copy first
+			if isinstance(value, IntWithGranularity):
+				out[key] = int(value)
+				assert f'{key}.granularity' not in out, f'Granularity collision on {key}.granularity'
+				out[f'{key}.granularity'] = value.granularity
+		return json.dumps(out, default = _json_serialise_datetime)
+
+
+@dataclasses.dataclass
+class Item(_JSONDataclass):
 	'''An abstract base class for an item returned by the scraper's get_items generator.

-	An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
+	An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).
+	'''

 	@abc.abstractmethod
 	def __str__(self):
 		pass


+@dataclasses.dataclass
+class Entity(_JSONDataclass):
+	'''An abstract base class for an entity returned by the scraper's entity property.
+
+	An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.
+	'''
+
+	@abc.abstractmethod
+	def __str__(self):
+		pass
+
+
+class IntWithGranularity(int):
+	'''A number with an associated granularity
+
+	For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.
+	'''
+
+	def __new__(cls, value, granularity, *args, **kwargs):
+		obj = super().__new__(cls, value, *args, **kwargs)
+		obj.granularity = granularity
+		return obj
+
+	def __reduce__(self):
+		return (IntWithGranularity, (int(self), self.granularity))
+
+
 class URLItem(Item):
 	'''A generic item which only holds a URL string.'''

@@ -40,34 +139,57 @@ class Scraper:

 	name = None

-	def __init__(self, retries = 3):
+	def __init__(self, *, retries = 3, proxies = None):
 		self._retries = retries
+		self._proxies = proxies
 		self._session = requests.Session()

 	@abc.abstractmethod
 	def get_items(self):
 		'''Iterator yielding Items.'''
+
 		pass

-	def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
+	def _get_entity(self):
+		'''Get the entity behind the scraper, if any.
+
+		This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
+		'''
+
+		return None
+
+	@functools.cached_property
+	def entity(self):
+		return self._get_entity()
+
+	def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
+		proxies = proxies or self._proxies or {}
 		for attempt in range(self._retries + 1):
 			# The request is newly prepared on each retry because of potential cookie updates.
 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
+			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
 			logger.info(f'Retrieving {req.url}')
 			logger.debug(f'... with headers: {headers!r}')
 			if data:
 				logger.debug(f'... with data: {data!r}')
+			if environmentSettings:
+				logger.debug(f'... with environmentSettings: {environmentSettings!r}')
 			try:
-				r = self._session.send(req, timeout = timeout)
+				r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
 			except requests.exceptions.RequestException as exc:
 				if attempt < self._retries:
 					retrying = ', retrying'
-					level = logging.WARNING
+					level = logging.INFO
 				else:
 					retrying = ''
 					level = logging.ERROR
 				logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
 			else:
+				redirected = f' (redirected to {r.url})' if r.history else ''
+				logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
+				if r.history: # Log every hop of the redirect chain with that hop's own status code and Location header
+					for i, redirect in enumerate(r.history):
+						logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})')
 				if responseOkCallback is not None:
 					success, msg = responseOkCallback(r)
 				else:
@@ -80,7 +202,7 @@ class Scraper:
 				else:
 					if attempt < self._retries:
 						retrying = ', retrying'
-						level = logging.WARNING
+						level = logging.INFO
 					else:
 						retrying = ''
 						level = logging.ERROR
@@ -102,11 +224,23 @@ class Scraper:
 		return self._request('POST', *args, **kwargs)

 	@classmethod
-	@abc.abstractmethod
-	def setup_parser(cls, subparser):
+	def _cli_setup_parser(cls, subparser):
 		pass

 	@classmethod
-	@abc.abstractmethod
-	def from_args(cls, args):
-		pass
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args) # Delegate to _cli_construct, which forwards argparseArgs.retries to the constructor
+
+	@classmethod
+	def _cli_construct(cls, argparseArgs, *args, **kwargs):
+		return cls(*args, **kwargs, retries = argparseArgs.retries)
+
+
+def nonempty_string(name):
+	def f(s):
+		s = s.strip()
+		if s:
+			return s
+		raise ValueError('must not be an empty string')
+	f.__name__ = name
+	return f
--- a/snscrape/modules/init.py
+++ b/snscrape/modules/init.py
@@ -1,15 +1,17 @@
-import importlib
-import os
-import snscrape.base
+import pkgutil
+
+
+__all__ = []


 def _import_modules():
-	files = os.listdir(__path__[0])
-	for fn in files:
-		if fn.endswith('.py') and fn != '__init__.py':
-			# Import module if not already imported
-			moduleName = f'snscrape.modules.{fn[:-3]}'
-			module = importlib.import_module(moduleName)
+	prefixLen = len(__name__) + 1
+	for importer, moduleName, isPkg in pkgutil.iter_modules(__path__, prefix = f'{__name__}.'):
+		assert not isPkg
+		moduleNameWithoutPrefix = moduleName[prefixLen:]
+		__all__.append(moduleNameWithoutPrefix)
+		module = importer.find_module(moduleName).load_module(moduleName)
+		globals()[moduleNameWithoutPrefix] = module


 _import_modules()
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -1,4 +1,8 @@
+__all__ = ['FacebookPost', 'User', 'FacebookUserScraper', 'FacebookCommunityScraper', 'FacebookGroupScraper']
+
+
 import bs4
+import dataclasses
 import datetime
 import json
 import logging
@@ -8,22 +12,44 @@ import typing
 import urllib.parse


-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


-class FacebookPost(typing.NamedTuple, snscrape.base.Item):
+@dataclasses.dataclass
+class FacebookPost(snscrape.base.Item):
 	cleanUrl: str
 	dirtyUrl: str
 	date: datetime.datetime
 	content: typing.Optional[str]
 	outlinks: list
-	outlinksss: str
+
+	outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

 	def __str__(self):
 		return self.cleanUrl


-class FacebookCommonScraper(snscrape.base.Scraper):
+@dataclasses.dataclass
+class User(snscrape.base.Entity):
+	username: str
+	pageId: int
+	name: str
+	verified: bool
+	created: typing.Optional[datetime.date] = None
+	pageOwner: typing.Optional[str] = None
+	likes: typing.Optional[int] = None
+	followers: typing.Optional[int] = None
+	checkins: typing.Optional[int] = None
+	address: typing.Optional[str] = None
+	phone: typing.Optional[str] = None
+	web: typing.Optional[str] = None
+	keywords: typing.Optional[typing.List[str]] = None
+
+	def __str__(self):
+		return f'https://www.facebook.com/{self.username}/'
+
+
+class _FacebookCommonScraper(snscrape.base.Scraper):
 	def _clean_url(self, dirtyUrl):
 		u = urllib.parse.urlparse(dirtyUrl)
 		if u.path == '/permalink.php':
@@ -41,7 +67,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 			if setVal.rstrip('0123456789').endswith('.a.'):
 				setVal = f'a.{setVal.rsplit(".", 1)[1]}'
 			clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
-		elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/'):
+		elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
 			# No manipulation of the path needed, but strip the query string
 			clean = (u.scheme, u.netloc, u.path, '', '')
 		elif u.path.split('/')[2] in ('photos', 'videos'):
@@ -80,11 +106,23 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 			return False, None

 	def _soup_to_items(self, soup, baseUrl, mode):
+		cleanUrl = None # Value from previous iteration is used for warning on link-less entries
 		for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
+			# Check that this is not inside another div._5pcr to avoid duplicates or extracting the wrong URL (e.g. 'X was mentioned in a post' on community pages)
+			parent = entry.parent
+			isNested = False
+			while parent:
+				if parent.name == 'div' and 'class' in parent.attrs and '_5pcr' in parent.attrs['class']:
+					isNested = True
+					break
+				parent = parent.parent
+			if isNested:
+				continue
+
 			entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
 			mediaSetA = entry.find('a', class_ = '_17z-')
 			if not mediaSetA and not entryA:
-				logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
+				_logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
 				continue
 			if mediaSetA and (not entryA or entryA['href'] == '#'):
 				href = mediaSetA['href']
@@ -93,12 +131,12 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 			oddLink, warn = self._is_odd_link(href, entry.text, mode)
 			if oddLink:
 				if warn:
-					logger.warning(f'Ignoring odd link: {href}')
+					_logger.warning(f'Ignoring odd link: {href}')
 				continue
 			dirtyUrl = urllib.parse.urljoin(baseUrl, href)
+			cleanUrl = self._clean_url(dirtyUrl)
 			date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
-			contentDiv = entry.find('div', class_ = '_5pbx')
-			if contentDiv:
+			if (contentDiv := entry.find('div', class_ = '_5pbx')):
 				content = contentDiv.text
 			else:
 				content = None
@@ -111,71 +149,157 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 					continue
 				query = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
 				if 'u' not in query or len(query['u']) != 1:
-					logger.warning(f'Ignoring odd outlink: {href}')
+					_logger.warning(f'Ignoring odd outlink: {href}')
 					continue
 				outlink = query['u'][0]
 				if outlink.startswith('http://') or outlink.startswith('https://') and outlink not in outlinks:
 					outlinks.append(outlink)
-			yield FacebookPost(cleanUrl = self._clean_url(dirtyUrl), dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks, outlinksss = ' '.join(outlinks))
+			yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks)


-class FacebookUserScraper(FacebookCommonScraper):
-	name = 'facebook-user'
-
+class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
 	def __init__(self, username, **kwargs):
 		super().__init__(**kwargs)
 		self._username = username
+		self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
+		self._initialPage = None
+		self._initialPageSoup = None
+
+	def _initial_page(self):
+		if self._initialPage is None:
+			_logger.info('Retrieving initial data')
+			r = self._get(self._baseUrl, headers = self._headers)
+			if r.status_code not in (200, 404):
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			self._initialPage = r
+			self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
+		return self._initialPage, self._initialPageSoup

 	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
-
 		nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
 		spuriousForLoopPattern = re.compile(r'^for \(;;\);')

-		logger.info('Retrieving initial data')
-		baseUrl = f'https://www.facebook.com/{self._username}/'
-		r = self._get(baseUrl, headers = headers)
+		r, soup = self._initial_page()
 		if r.status_code == 404:
-			logger.warning('User does not exist')
+			_logger.warning('User does not exist')
 			return
-		elif r.status_code != 200:
-			logger.error('Got status code {r.status_code}')
-			return
-		soup = bs4.BeautifulSoup(r.text, 'lxml')
-		yield from self._soup_to_items(soup, baseUrl, 'user')
-		nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
+		yield from self._soup_to_items(soup, self._baseUrl, 'user')

-		while nextPageLink:
-			logger.info('Retrieving next page')
+		while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
+			_logger.info('Retrieving next page')

 			# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
 			# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
-			r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
+			r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
 			if r.status_code != 200:
-				logger.error(f'Got status code {r.status_code}')
-				return
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			response = json.loads(spuriousForLoopPattern.sub('', r.text))
 			assert 'domops' in response
 			assert len(response['domops']) == 1
 			assert len(response['domops'][0]) == 4
 			assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
-			assert response['domops'][0][1] == '#www_pages_reaction_see_more_unitwww_pages_home'
+			assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
 			assert response['domops'][0][2] == False
 			assert '__html' in response['domops'][0][3]
 			soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
-			yield from self._soup_to_items(soup, baseUrl, 'user')
-			nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
+			yield from self._soup_to_items(soup, self._baseUrl, 'user')

 	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('username', help = 'A Facebook username or user ID')
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')

 	@classmethod
-	def from_args(cls, args):
-		return cls(args.username, retries = args.retries)
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.username)


-class FacebookGroupScraper(FacebookCommonScraper):
+class FacebookUserScraper(_FacebookUserAndCommunityScraper):
+	name = 'facebook-user'
+
+	def __init__(self, *args, **kwargs):
+		super().__init__(*args, **kwargs)
+		self._baseUrl = f'https://www.facebook.com/{self._username}/'
+
+	def _get_entity(self):
+		kwargs = {}
+
+		nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
+		handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
+		handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
+		months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
+		createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')
+
+		r, soup = self._initial_page()
+		if r.status_code != 200:
+			return
+
+		handleDiv = handleDivPattern.search(r.text)
+		handle = handlePattern.search(handleDiv.group(0))
+		kwargs['username'] = handle.group(1)
+
+		nameVerifiedMarkup = nameVerifiedMarkupPattern.search(r.text)
+		nameVerifiedMarkup = json.loads(nameVerifiedMarkup.group(1))
+		nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedMarkup, 'lxml')
+		kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
+		kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))
+
+		pageTransparencyContentDiv = soup.find('div', class_ = '_61-0')
+		if pageTransparencyContentDiv.text.startswith('Page created - '):
+			createdDateMess = pageTransparencyContentDiv.text.split(' - ', 1)[1]
+			m = createdDatePattern.match(createdDateMess)
+			assert m, 'unexpected created div content'
+			kwargs['created'] = datetime.date(int(m.group(3)), months.index(m.group(1)) + 1, int(m.group(2)))
+		if pageTransparencyContentDiv.text.startswith('Confirmed Page Owner: '):
+			kwargs['pageOwner'] = pageTransparencyContentDiv.text.split(': ', 1)[1]
+
+		communityDiv = soup.find('div', class_ = '_6590')
+		for div in communityDiv.find_all('div', class_ = '_4bl9'):
+			text = div.text
+			if text.endswith(' people like this'):
+				kwargs['likes'] = int(text.split(' ', 1)[0].replace(',', ''))
+			elif text.endswith(' people follow this'):
+				kwargs['followers'] = int(text.split(' ', 1)[0].replace(',', ''))
+			elif text.endswith(' check-ins'):
+				kwargs['checkins'] = int(text.split(' ', 1)[0].replace(',', ''))
+
+		aboutDiv = soup.find('div', class_ = '_u9q')
+		if aboutDiv:
+			# As if the above wasn't already ugly enough, this is where it gets really bad...
+			for div in aboutDiv.find_all('div', class_ = '_2pi9'):
+				img = div.find('img', class_ = '_3-91')
+				if not img:
+					continue
+				if img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png': # Address
+					rawAddress = div.find('div', class_ = '_2wzd').text
+					kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
+				elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png': # Phone number
+					kwargs['phone'] = div.find('div', class_ = '_4bl9').text
+				elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png': # Web link
+					for a in div.find_all('a'):
+						if a.text == '' or 'href' not in a.attrs or a.find('span'):
+							continue
+						dirtyWeb = a['href']
+						assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
+						kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
+				elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png': # Keywords
+					kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')
+
+		androidUrlMeta = soup.find('meta', property = 'al:android:url')
+		assert androidUrlMeta['content'].startswith('fb://page/') and androidUrlMeta['content'].endswith('?referrer=app_link')
+		kwargs['pageId'] = int(androidUrlMeta['content'][10:-18])
+
+		return User(**kwargs)
+
+
+class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
+	name = 'facebook-community'
+
+	def __init__(self, *args, **kwargs):
+		super().__init__(*args, **kwargs)
+		self._baseUrl = f'https://www.facebook.com/{self._username}/community/'
+
+
+class FacebookGroupScraper(_FacebookCommonScraper):
 	name = 'facebook-group'

 	def __init__(self, group, **kwargs):
@@ -189,18 +313,16 @@ class FacebookGroupScraper(FacebookCommonScraper):
 		pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
 		spuriousForLoopPattern = re.compile(r'^for \(;;\);')

-		baseUrl = f'https://www.facebook.com/groups/{self._group}/'
+		baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
 		r = self._get(baseUrl, headers = headers)
 		if r.status_code == 404:
-			logger.warning('Group does not exist')
+			_logger.warning('Group does not exist')
 			return
 		elif r.status_code != 200:
-			logger.error('Got status code {r.status_code}')
-			return
+			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

 		if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
-			logger.error('Code container ID marker not found (does the group exist?)')
-			return
+			raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

 		soup = bs4.BeautifulSoup(r.text, 'lxml')

@@ -210,35 +332,33 @@ class FacebookGroupScraper(FacebookCommonScraper):
 			codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
 			codeContainer = soup.find('code', id = codeContainerId)
 			if not codeContainer:
-				raise RuntimeError('Code container not found')
+				raise snscrape.base.ScraperException('Code container not found')
 			if type(codeContainer.string) is not bs4.element.Comment:
-				raise RuntimeError('Code container does not contain a comment')
+				raise snscrape.base.ScraperException('Code container does not contain a comment')
 			codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
 			yield from self._soup_to_items(codeSoup, baseUrl, 'group')

 		# Pagination
-		data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
-		while True:
+		while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
 			# As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
 			r = self._get(
-				f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
+				'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
 				params = {'data': data, '__a': 1},
 				headers = headers,
 			  )
 			if r.status_code != 200:
-				raise RuntimeError(f'Got status code {r.status_code}')
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			obj = json.loads(spuriousForLoopPattern.sub('', r.text))
 			if obj['payload'] == '':
 				# End of pagination
 				break
 			soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
 			yield from self._soup_to_items(soup, baseUrl, 'group')
-			data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]

 	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('group', help = 'A group name or ID')
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('group', type = snscrape.base.nonempty_string('group'), help = 'A group name or ID')

 	@classmethod
-	def from_args(cls, args):
-		return cls(args.group, retries = args.retries)
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.group)
--- a/snscrape/modules/gab.py
+++ b/snscrape/modules/gab.py
@@ -1,115 +0,0 @@
-import datetime
-import json
-import logging
-import snscrape.base
-import time
-import typing
-import urllib.parse
-
-
-logger = logging.getLogger(__name__)
-
-
-class GabPost(typing.NamedTuple, snscrape.base.Item):
-	url: str
-	date: datetime.datetime
-	content: str
-
-	def __str__(self):
-		return self.url
-
-
-class GabUserCommonScraper(snscrape.base.Scraper):
-	def __init__(self, mode, username, **kwargs):
-		super().__init__(**kwargs)
-		if mode not in ('posts', 'comments', 'media'):
-			raise ValueError('Invalid mode')
-		self._mode = mode
-		self._username = username
-		if mode == 'posts':
-			self._baseUrl = f'https://gab.com/api/feed/{username}'
-			self._beforeGlue = '?'
-		elif mode == 'comments':
-			self._baseUrl = f'https://gab.com/api/feed/{username}/comments?includes=post.conversation_parent'
-			self._beforeGlue = '&'
-		elif mode == 'media':
-			self._baseUrl = f'https://gab.com/api/feed/{username}/media'
-			self._beforeGlue = '?'
-
-	def _response_to_items(self, response):
-		yielded = set()
-		for post in response['data']:
-			if post['post']['id'] not in yielded:
-				yield GabPost(
-				  url = f'https://gab.com/{post["post"]["user"]["username"]}/posts/{post["post"]["id"]}',
-				  date = datetime.datetime.strptime(post['post']['created_at'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z'),
-				  content = post['post']['body'],
-				 )
-				yielded.add(post['post']['id'])
-
-	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
-
-		logger.info('Retrieving initial data')
-		r = self._get(self._baseUrl, headers = headers)
-		if r.status_code == 404:
-			logger.error('User does not exist')
-			return
-		elif r.status_code != 200:
-			logger.error(f'Got status code {r.status_code}')
-			return
-
-		response = json.loads(r.text)
-		if not response['data']:
-			logger.error('User has no posts')
-			return
-		yield from self._response_to_items(response)
-		if self._mode == 'posts':
-			before = response['data'][-1]['published_at']
-		elif self._mode in ('comments', 'media'):
-			before = 30
-
-		while True:
-			logger.info('Retrieving next page')
-			r = self._get(f'{self._baseUrl}{self._beforeGlue}before={before}', headers = headers)
-			if r.status_code != 200:
-				logger.error(f'Got status code {r.status_code}')
-				return
-			response = json.loads(r.text)
-			yield from self._response_to_items(response)
-			if response['no-more'] or not response['data']:
-				# Last page
-				return
-			if self._mode == 'posts':
-				before = response['data'][-1]['published_at']
-			elif self._mode in ('comments', 'media'):
-				before += 30
-			time.sleep(1) # Gab's API is pretty quick but doesn't like being hammered...
-
-	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('username', help = 'A Gab username')
-
-
-class GabUserPostsScraper(GabUserCommonScraper):
-	name = 'gab-user'
-
-	@classmethod
-	def from_args(cls, args):
-		return cls('posts', args.username, retries = args.retries)
-
-
-class GabUserCommentsScraper(GabUserCommonScraper):
-	name = 'gab-user-comments'
-
-	@classmethod
-	def from_args(cls, args):
-		return cls('comments', args.username, retries = args.retries)
-
-
-class GabUserMediaScraper(GabUserCommonScraper):
-	name = 'gab-user-media'
-
-	@classmethod
-	def from_args(cls, args):
-		return cls('media', args.username, retries = args.retries)
--- a/snscrape/modules/googleplus.py
+++ b/snscrape/modules/googleplus.py
@@ -1,102 +0,0 @@
-import datetime
-import itertools
-import json
-import logging
-import re
-import snscrape.base
-
-
-logger = logging.getLogger(__name__)
-
-
-class GooglePlusUserScraper(snscrape.base.Scraper):
-	name = 'googleplus-user'
-
-	def __init__(self, user, **kwargs):
-		super().__init__(**kwargs)
-		self._user = user
-
-	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
-
-		logger.info('Retrieving initial data')
-		r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
-		if r.status_code == 404:
-			logger.warning('User does not exist')
-			return
-		elif r.status_code != 200:
-			logger.error(f'Got status code {r.status_code}')
-			return
-
-		# Global data; only needed for the session ID
-		#TODO: Make this more robust somehow
-		match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
-		if not match:
-			logger.error('Unable to find session ID')
-			return
-		sid = match.group('sid')
-
-		# Page data
-		# As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
-		match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
-		if not match:
-			logger.error('Unable to extract data')
-			return
-		jsonData = match.group('data')
-		response = json.loads(jsonData)
-		if response[0][7] is None:
-			logger.info('User has no posts')
-			return
-		for postObj in response[0][7]:
-			yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
-		cursor = response[0][1] # 'ADSJ_x'
-		if cursor is None:
-			# No further pages
-			return
-		baseDate = datetime.datetime.utcnow()
-		baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
-		userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]
-
-		for counter in itertools.count(start = 2):
-			logger.info('Retrieving next page')
-			reqid = 1 + baseSeconds + int(1e5) * counter
-			r = self._post(
-			    f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
-			    data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
-			    headers = headers
-			  )
-			if r.status_code != 200:
-				logger.error(f'Got status code {r.status_code}')
-				return
-
-			# As if everything up to here wasn't terrible already, this is where it gets *really* bad.
-			# The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
-			# The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
-			# It sucks.
-			# Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
-			# I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
-			# Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.
-
-			garbage = r.text
-			assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
-			data = []
-			pos = 6
-			while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
-				pos += 1
-			response = json.JSONDecoder().raw_decode(''.join(garbage[pos:]))[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end
-
-			for postObj in response[0][2]['74333095'][0][7]:
-				yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
-
-			cursor = response[0][2]['74333095'][0][1]
-
-			if cursor is None:
-				break
-
-	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')
-
-	@classmethod
-	def from_args(cls, args):
-		return cls(args.user, retries = args.retries)
--- a/snscrape/modules/instagram.py
+++ b/snscrape/modules/instagram.py
@@ -1,73 +1,87 @@
+__all__ = ['InstagramPost', 'User', 'InstagramUserScraper', 'InstagramHashtagScraper', 'InstagramLocationScraper']
+
+
+import dataclasses
 import datetime
 import hashlib
 import json
 import logging
+import re
 import snscrape.base
 import typing


-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


-class InstagramPost(typing.NamedTuple, snscrape.base.Item):
-	cleanUrl: str
-	dirtyUrl: str
+@dataclasses.dataclass
+class InstagramPost(snscrape.base.Item):
+	url: str
 	date: datetime.datetime
-	content: str
+	content: typing.Optional[str]
 	thumbnailUrl: str
 	displayUrl: str
+	username: typing.Optional[str]
+	likes: int
+	comments: int
+	commentsDisabled: bool
+	isVideo: bool

 	def __str__(self):
-		return self.cleanUrl
+		return self.url


-class InstagramCommonScraper(snscrape.base.Scraper):
-	def __init__(self, mode, name, **kwargs):
+@dataclasses.dataclass
+class User(snscrape.base.Entity):
+	username: str
+	name: typing.Optional[str]
+	followers: snscrape.base.IntWithGranularity
+	following: snscrape.base.IntWithGranularity
+	posts: snscrape.base.IntWithGranularity
+
+	followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
+	followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
+	postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
+
+	def __str__(self):
+		return f'https://www.instagram.com/{self.username}/'
+
+
+class _InstagramCommonScraper(snscrape.base.Scraper):
+	def __init__(self, **kwargs):
 		super().__init__(**kwargs)
-		if mode not in ('User', 'Hashtag', 'Location'):
-			raise ValueError('Invalid mode')
-		self._mode = mode
-		self._name = name
-
-		if self._mode == 'User':
-			self._initialUrl = f'https://www.instagram.com/{self._name}/'
-			self._pageName = 'ProfilePage'
-			self._responseContainer = 'user'
-			self._edgeXToMedia = 'edge_owner_to_timeline_media'
-			self._pageIDKey = 'id'
-			self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
-			self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
-		elif self._mode == 'Hashtag':
-			self._initialUrl = f'https://www.instagram.com/explore/tags/{self._name}/'
-			self._pageName = 'TagPage'
-			self._responseContainer = 'hashtag'
-			self._edgeXToMedia = 'edge_hashtag_to_media'
-			self._pageIDKey = 'name'
-			self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
-			self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
-		elif self._mode == 'Location':
-			self._initialUrl = f'https://www.instagram.com/explore/locations/{self._name}/'
-			self._pageName = 'LocationsPage'
-			self._responseContainer = 'location'
-			self._edgeXToMedia = 'edge_location_to_media'
-			self._pageIDKey = 'id'
-			self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
-			self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
+		self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
+		self._initialPage = None

 	def _response_to_items(self, response):
 		for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
 			code = node['node']['shortcode']
-			usernameQuery = '?taken-by=' + node['node']['owner']['username'] if 'username' in node['node']['owner'] else ''
-			cleanUrl = f'https://www.instagram.com/p/{code}/'
+			username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else None
+			url = f'https://www.instagram.com/p/{code}/'
 			yield InstagramPost(
-			  cleanUrl = cleanUrl,
-			  dirtyUrl = f'{cleanUrl}{usernameQuery}',
+			  url = url,
 			  date = datetime.datetime.fromtimestamp(node['node']['taken_at_timestamp'], datetime.timezone.utc),
 			  content = node['node']['edge_media_to_caption']['edges'][0]['node']['text'] if len(node['node']['edge_media_to_caption']['edges']) else None,
 			  thumbnailUrl = node['node']['thumbnail_src'],
 			  displayUrl = node['node']['display_url'],
+			  username = username,
+			  likes = node['node']['edge_media_preview_like']['count'],
+			  comments = node['node']['edge_media_to_comment']['count'],
+			  commentsDisabled = node['node']['comments_disabled'],
+			  isVideo = node['node']['is_video'],
 			 )

+	def _initial_page(self):
+		if self._initialPage is None:
+			_logger.info('Retrieving initial data')
+			r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
+			if r.status_code not in (200, 404):
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			elif r.url.startswith('https://www.instagram.com/accounts/login/'):
+				raise snscrape.base.ScraperException('Redirected to login page')
+			self._initialPage = r
+		return self._initialPage
+
 	def _check_initial_page_callback(self, r):
 		if r.status_code != 200:
 			return True, None
@@ -82,6 +96,8 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 	def _check_json_callback(self, r):
 		if r.status_code != 200:
 			return False, f'status code {r.status_code}'
+		if r.url.startswith('https://www.instagram.com/accounts/login/'):
+			raise snscrape.base.ScraperException('Redirected to login page')
 		try:
 			obj = json.loads(r.text)
 		except json.JSONDecodeError as e:
@@ -90,23 +106,17 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 		return True, None

 	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
-
-		logger.info('Retrieving initial data')
-		r = self._get(self._initialUrl, headers = headers, responseOkCallback = self._check_initial_page_callback)
+		r = self._initial_page()
 		if r.status_code == 404:
-			logger.warning(f'{self._mode} does not exist')
-			return
-		elif r.status_code != 200:
-			logger.error(f'Got status code {r.status_code}')
+			_logger.warning('Page does not exist')
 			return
 		response = r._snscrape_json_obj
 		rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
 		if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
-			logger.info(f'{self._mode} has no posts')
+			_logger.info('Page has no posts')
 			return
 		if not response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['edges']:
-			logger.warning('Private account')
+			_logger.warning('Private account')
 			return
 		pageID = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._pageIDKey]
 		yield from self._response_to_items(response['entry_data'][self._pageName][0]['graphql'])
@@ -114,16 +124,16 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 			return
 		endCursor = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']

+		headers = self._headers.copy()
 		while True:
-			logger.info(f'Retrieving endCursor = {endCursor!r}')
+			_logger.info(f'Retrieving endCursor = {endCursor!r}')
 			variables = self._variablesFormat.format(**locals())
 			headers['X-Requested-With'] = 'XMLHttpRequest'
 			headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
 			r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)

 			if r.status_code != 200:
-				logger.error(f'Got status code {r.status_code}')
-				return
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

 			response = r._snscrape_json_obj
 			if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
@@ -134,37 +144,100 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 			endCursor = response['data'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']


-class InstagramUserScraper(InstagramCommonScraper):
+class InstagramUserScraper(_InstagramCommonScraper):
 	name = 'instagram-user'

-	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('username', help = 'An Instagram username (no leading @)')
+	def __init__(self, username, **kwargs):
+		super().__init__(**kwargs)
+		self._initialUrl = f'https://www.instagram.com/{username}/'
+		self._pageName = 'ProfilePage'
+		self._responseContainer = 'user'
+		self._edgeXToMedia = 'edge_owner_to_timeline_media'
+		self._pageIDKey = 'id'
+		self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
+		self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
+
+	def _get_entity(self):
+		r = self._initial_page()
+		if r.status_code != 200:
+			return
+		if '<meta property="og:description" content="' not in r.text:
+			return
+		ogDescriptionContentPos = r.text.index('<meta property="og:description" content="') + len('<meta property="og:description" content="')
+		ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]
+
+		numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
+		ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_.-]+))$') # Bare handles (no display name) may contain periods too
+		m = ogDescriptionPattern.match(ogDescription)
+		assert m, 'unexpected og:description format'
+
+		def parse_num(s):
+			if s.endswith('m'):
+				return int(float(s[:-1].replace(',', '')) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].replace(',', '').split('.')[1]))
+			elif s.endswith('k'):
+				return int(float(s[:-1].replace(',', '')) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].replace(',', '').split('.')[1]))
+			else:
+				return int(s.replace(',', '')), 1
+
+		followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
+		following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
+		posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
+		return User(
+			username = m.group(5) or m.group(6),
+			name = m.group(4) or None,
+			followers = followers,
+			following = following,
+			posts = posts,
+		  )

 	@classmethod
-	def from_args(cls, args):
-		return cls('User', args.username, retries = args.retries)
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'An Instagram username (no leading @)')
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.username)


-class InstagramHashtagScraper(InstagramCommonScraper):
+class InstagramHashtagScraper(_InstagramCommonScraper):
 	name = 'instagram-hashtag'

-	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('hashtag', help = 'An Instagram hashtag (no leading #)')
+	def __init__(self, hashtag, **kwargs):
+		super().__init__(**kwargs)
+		self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
+		self._pageName = 'TagPage'
+		self._responseContainer = 'hashtag'
+		self._edgeXToMedia = 'edge_hashtag_to_media'
+		self._pageIDKey = 'name'
+		self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
+		self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'

 	@classmethod
-	def from_args(cls, args):
-		return cls('Hashtag', args.hashtag, retries = args.retries)
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('hashtag', type = snscrape.base.nonempty_string('hashtag'), help = 'An Instagram hashtag (no leading #)')
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.hashtag)


-class InstagramLocationScraper(InstagramCommonScraper):
+class InstagramLocationScraper(_InstagramCommonScraper):
 	name = 'instagram-location'

+	def __init__(self, locationId, **kwargs):
+		super().__init__(**kwargs)
+		self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
+		self._pageName = 'LocationsPage'
+		self._responseContainer = 'location'
+		self._edgeXToMedia = 'edge_location_to_media'
+		self._pageIDKey = 'id'
+		self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
+		self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
+
 	@classmethod
-	def setup_parser(cls, subparser):
+	def _cli_setup_parser(cls, subparser):
 		subparser.add_argument('locationid', help = 'An Instagram location ID', type = int)

 	@classmethod
-	def from_args(cls, args):
-		return cls('Location', args.locationid, retries = args.retries)
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.locationid)
--- a/snscrape/modules/mastodon.py
+++ b/snscrape/modules/mastodon.py
@@ -0,0 +1,340 @@
+__all__ = ['Toot', 'Boost', 'Attachment', 'Poll', 'PollOption', 'User', 'CustomEmoji', 'MastodonProfileScraper', 'MastodonTootScraperMode', 'MastodonTootScraper']
+
+
+import bs4
+import dataclasses
+import datetime
+import enum
+import json
+import logging
+import snscrape.base
+import time
+import typing
+import urllib.parse
+
+
+_logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class Toot(snscrape.base.Item):
+	url: str
+	id: str
+	user: 'User'
+	date: datetime.datetime
+	text: str
+	spoilerText: typing.Optional[str] = None
+	attachments: typing.Optional[typing.List['Attachment']] = None
+	links: typing.Optional[typing.List[str]] = None
+	mentionedUsers: typing.Optional[typing.List['User']] = None
+	hashtags: typing.Optional[typing.List[str]] = None
+	poll: typing.Optional['Poll'] = None
+
+	def __str__(self):
+		return self.url
+
+
+@dataclasses.dataclass
+class Boost(snscrape.base.Item):
+	user: 'User'
+	toot: Toot
+
+	def __str__(self):
+		# Boosts don't have their own URLs
+		return str(self.toot)
+
+
+@dataclasses.dataclass
+class Attachment:
+	url: str
+	name: str
+
+
+@dataclasses.dataclass
+class Poll:
+	id: str
+	expirationDate: datetime.datetime
+	multiple: bool
+	options: typing.List['PollOption']
+	votesCount: int
+	votersCount: typing.Optional[int] = None # Available since version 3.0.0 (commit 3babf846)
+
+
+@dataclasses.dataclass
+class PollOption:
+	title: str
+	votesCount: int
+
+
+@dataclasses.dataclass
+class User(snscrape.base.Entity):
+	account: str # @username@domain.invalid
+	displayName: typing.Optional[str] = None
+	displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
+	avatarUrl: typing.Optional[str] = None
+	_url: typing.Optional[str] = None
+
+	@property
+	def url(self):
+		if self._url:
+			return self._url
+		return f'https://{"/@".join(reversed(self.account[1:].split("@")))}'
+
+	def __str__(self):
+		return self.url
+
+
+@dataclasses.dataclass
+class CustomEmoji:
+	shortName: str # Taken from the emoji image's alt text
+	url: str
+	staticUrl: typing.Optional[str] = None # Absent for 2.0.0 to 2.9.4 markup, which exposes only a single image URL
+
+
+class _MastodonCommonScraper(snscrape.base.Scraper):
+	def __init__(self, **kwargs):
+		super().__init__(**kwargs)
+		self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
+		self._lastRequest = 0
+
+	def _rate_limited_get(self, *args, **kwargs):
+		if (diff := time.time() - self._lastRequest) < 3:
+			time.sleep(3 - diff)
+		self._lastRequest = time.time()
+		return self._get(*args, **kwargs)
+
+	def _entries_to_items(self, entries, url):
+		for entry in entries:
+			if entry.find('a', class_ = 'load-more'):
+				continue
+
+			tootKwargs = {}
+
+			info = entry.find('div', class_ = 'status__info')
+			if not info: # Before 2.5.0 (commit bb71538b)
+				info = entry.find('div', class_ = 'status__header')
+			if not info: # Detailed status (i.e. toot page rather than timeline)?
+				info = entry.find('div', class_ = 'detailed-status__meta')
+			link = info.find('a', class_ = 'status__relative-time')
+			if not link: # Detailed status?
+				link = info.find('a', class_ = 'detailed-status__datetime')
+			tootKwargs['url'] = link['href']
+			tootKwargs['id'] = tootKwargs['url'].rsplit('/', 1)[1]
+			tootKwargs['date'] = datetime.datetime.strptime(info.find('data', class_ = 'dt-published')['value'], '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo = datetime.timezone.utc)
+
+			userKwargs = {}
+			userLink = info.find('a', class_ = 'status__display-name')
+			if not userLink: # Detailed status?
+				userLink = entry.find('a', class_ = 'detailed-status__display-name')
+			userNameSpan = userLink.find('span', class_ = 'display-name')
+			userKwargs['account'] = userNameSpan.find('span').text.strip()
+			if userKwargs['account'].count('@') == 1: # Ancient versions don't include the instance for posts from accounts on the instance itself
+				userKwargs['account'] = self._url_to_account(userLink['href'])
+			userKwargs['_url'] = urllib.parse.urljoin(url, userLink['href'])
+			userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userNameSpan.find('strong'), url)
+			userKwargs['avatarUrl'] = urllib.parse.urljoin(url, userLink.find('img', class_ = 'u-photo')['src'])
+			tootKwargs['user'] = User(**userKwargs)
+
+			content = entry.find('div', class_ = 'status__content')
+			if not content.find(class_ = 'status__content__spoiler-link'):
+				tootKwargs['text'] = '\n\n'.join(p.text for p in content.find_all('p'))
+			else:
+				tootKwargs['text'] = content.find('span', class_ = 'p-summary').text
+				tootKwargs['spoilerText'] = '\n\n'.join(p.text for p in content.find('div', class_ = 'e-content').find_all('p'))
+
+			if (attachmentsDiv := entry.find('div', class_ = 'attachment-list')):
+				attachments = []
+				for a in attachmentsDiv.find_all('a'):
+					attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a.text.strip()))
+				tootKwargs['attachments'] = attachments
+			elif (mediaGalleryDiv := entry.find('div', attrs = {'data-component': 'MediaGallery'})): # Before 2.7.0 (https://github.com/mastodon/mastodon/issues/6714)
+				o = json.loads(mediaGalleryDiv['data-props'])
+				attachments = []
+				for medium in o['media']:
+					attachments.append(Attachment(url = urllib.parse.urljoin(url, medium['url']), name = medium['url'].rsplit('/', 1)[-1].strip()))
+				tootKwargs['attachments'] = attachments
+			elif (attachmentsDiv := entry.find('div', class_ = 'status__attachments')): # Before 2.3.0 (commit 2bbf987a)
+				attachments = []
+				for a in attachmentsDiv.find_all('a'):
+					attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a['href'].rsplit('/', 1)[1]))
+				tootKwargs['attachments'] = attachments
+
+			links = []
+			mentionedUsers = []
+			hashtags = []
+			for a in content.find_all('a'):
+				cls = a.get('class', [])
+				if 'mention' in cls and 'u-url' in cls:
+					mentionUrl = urllib.parse.urljoin(url, a['href'])
+					mentionedUsers.append(User(account = self._url_to_account(mentionUrl), _url = mentionUrl))
+				elif 'mention' in cls and 'hashtag' in cls:
+					hashtags.append(a.text.strip())
+				else:
+					links.append(urllib.parse.urljoin(url, a['href']))
+			if links:
+				tootKwargs['links'] = links
+			if mentionedUsers:
+				tootKwargs['mentionedUsers'] = mentionedUsers
+			if hashtags:
+				tootKwargs['hashtags'] = hashtags
+
+			if (pollDiv := entry.find('div', attrs = {'data-component': 'Poll'})):
+				o = json.loads(pollDiv['data-props'])
+				pollKwargs = {}
+				pollKwargs['id'] = o['poll']['id']
+				pollKwargs['expirationDate'] = datetime.datetime.strptime(o['poll']['expires_at'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo = datetime.timezone.utc)
+				pollKwargs['multiple'] = o['poll']['multiple']
+				pollKwargs['options'] = [PollOption(title = op['title'], votesCount = op['votes_count']) for op in o['poll']['options']]
+				pollKwargs['votesCount'] = o['poll']['votes_count']
+				if 'voters_count' in o['poll']: # 3.0.0 (commit 3babf846)
+					pollKwargs['votersCount'] = o['poll']['voters_count']
+				tootKwargs['poll'] = Poll(**pollKwargs)
+
+			toot = Toot(**tootKwargs)
+
+			# Boosts
+			prepend = entry.find('div', class_ = 'status__prepend')
+			if not prepend: # Before 2.5.0 (commit bb71538b)
+				prepend = entry.find('div', class_ = 'pre-header')
+			if prepend and prepend.find('i', class_ = 'fa-retweet'): # Is a boost
+				userKwargs = {}
+				userLink = prepend.find('a', class_ = 'status__display-name')
+				# The user is always on this instance since that's the only place where boosts are shown, hence there is no explicit account span. Reconstruct from URL.
+				userUrl = urllib.parse.urljoin(url, userLink['href'])
+				assert userUrl.count('/') == 3 and userUrl.count('/@') == 1
+				userKwargs['account'] = '@'.join(reversed(userUrl.split('/')[2:]))
+				userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userLink.find('strong'), url)
+				toot = Boost(user = User(**userKwargs), toot = toot)
+
+			yield toot
+
+	def _display_name(self, strong, url):
+		outPlain = []
+		outFull = []
+		hasCustomEmoji = False
+		for child in strong.children:
+			if isinstance(child, bs4.element.NavigableString):
+				outPlain.append(str(child))
+				outFull.append(str(child))
+			elif child.name == 'img' and 'custom-emoji' in child.get('class', []):
+				hasCustomEmoji = True
+				outPlain.append(child['alt'])
+				outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['data-original']), staticUrl = urllib.parse.urljoin(url, child['data-static'])))
+			elif child.name == 'img' and 'emojione' in child.get('class', []):
+				# Version 2.0.0 (which first added custom emojis) to 2.9.4: no data-* attributes, only gets one of the URLs with no (easy, reliable) way of knowing which it is.
+				hasCustomEmoji = True
+				outPlain.append(child['alt'])
+				outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['src'])))
+			else:
+				_logger.warning(f'Unexpected display name child: {child!r}')
+		return ''.join(outPlain), outFull if hasCustomEmoji else None
+
+	@staticmethod
+	def _url_to_account(url):
+		if url.count('/') == 3 and url.count('/@') == 1:
+			return '@'.join(reversed(url.split('/')[2:]))
+		if url.count('/') == 4 and '/users/' in url: # E.g. Pleroma, also supported by Mastodon
+			return '@' + '@'.join(reversed(url.split('/')[2::2]))
+		if url.count('/') == 4 and '/accounts/' in url: # E.g. Peertube
+			return '@' + '@'.join(reversed(url.split('/')[2::2]))
+		if url.count('/') == 4 and '/profile/' in url: # E.g. Friendica
+			return '@' + '@'.join(reversed(url.split('/')[2::2]))
+		raise ValueError('Unrecognised account URL format')
+
+
+class MastodonProfileScraper(_MastodonCommonScraper):
+	name = 'mastodon-profile'
+
+	def __init__(self, account, **kwargs):
+		super().__init__(**kwargs)
+		if account.startswith('@') and account.count('@') == 2: # @account@domain form; anything else is used as the profile URL verbatim
+			account, domain = account[1:].split('@')
+			url = f'https://{domain}/@{account}'
+		else:
+			url = account
+		self._url = url
+
+	def get_items(self):
+		initial = True
+		while True:
+			if initial:
+				r = self._rate_limited_get(f'{self._url}/with_replies', headers = self._headers)
+				if r.status_code not in (200, 404):
+					raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+				if r.status_code == 404: # Possibly an old instance where with_replies doesn't exist, try without that.
+					r = self._rate_limited_get(self._url, headers = self._headers)
+					if r.status_code not in (200, 404):
+						raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+					if r.status_code == 404:
+						_logger.warning('Account does not exist')
+						return
+					_logger.warning('Old Mastodon instance, cannot retrieve reply toots')
+				initial = False
+			else:
+				r = self._rate_limited_get(url, headers = self._headers) # url was set at the end of the previous iteration
+				if r.status_code != 200:
+					raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			soup = bs4.BeautifulSoup(r.text, 'lxml')
+
+			yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)
+
+			nextA = soup.find('a', class_ = 'load-more', href = lambda x: x is not None and ('?max_id=' in x or '&max_id=' in x)) # The filter receives None for anchors without an href
+			if not nextA: # Before 2.5.0 (commit bb71538b)
+				paginationDiv = soup.find('div', class_ = 'pagination')
+				if paginationDiv:
+					nextA = paginationDiv.find('a', class_ = 'next')
+			if not nextA: # End of pagination
+				break
+			url = urllib.parse.urljoin(r.url, nextA['href'])
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('account', type = snscrape.base.nonempty_string('account'), help = 'A Mastodon account. This can be either a URL to the profile page or a string of the form @account@instance.example.org')
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.account)
+
+
+class MastodonTootScraperMode(enum.Enum):
+	SINGLE = 'single'
+	THREAD = 'thread'
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		if args.thread:
+			return cls.THREAD
+		return cls.SINGLE
+
+
+class MastodonTootScraper(_MastodonCommonScraper):
+	name = 'mastodon-toot'
+
+	def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
+		super().__init__(**kwargs)
+		self._url = url
+		self._mode = mode
+
+	def get_items(self):
+		r = self._rate_limited_get(self._url, headers = self._headers)
+		if r.status_code == 404:
+			_logger.warning('Toot does not exist')
+			return
+		if r.status_code != 200:
+			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+		soup = bs4.BeautifulSoup(r.text, 'lxml')
+		if self._mode is MastodonTootScraperMode.SINGLE:
+			status = soup.find('div', class_ = 'detailed-status')
+			entry = status.parent
+			yield from self._entries_to_items([entry], r.url)
+		elif self._mode is MastodonTootScraperMode.THREAD:
+			yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('--thread', action = 'store_true', help = 'Collect thread around the toot referenced by the URL')
+		subparser.add_argument('url', type = snscrape.base.nonempty_string('url'), help = 'A URL for a toot')
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.url, mode = MastodonTootScraperMode._cli_from_args(args))
--- a/snscrape/modules/reddit.py
+++ b/snscrape/modules/reddit.py
@@ -0,0 +1,285 @@
+__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper']
+
+
+import dataclasses
+import datetime
+import logging
+import re
+import snscrape.base
+import snscrape.version
+import string
+import time
+import typing
+
+
+_logger = logging.getLogger(__name__)
+
+
+# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
+
+@dataclasses.dataclass
+class Submission(snscrape.base.Item):
+	author: typing.Optional[str] # E.g. submission hf7k6
+	date: datetime.datetime
+	id: str
+	link: typing.Optional[str]
+	selftext: typing.Optional[str]
+	subreddit: typing.Optional[str] # E.g. submission 617p51
+	title: str
+	url: str
+
+	created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
+
+	def __str__(self):
+		return self.url
+
+
+@dataclasses.dataclass
+class Comment(snscrape.base.Item):
+	author: typing.Optional[str]
+	body: str
+	date: datetime.datetime
+	id: str
+	parentId: typing.Optional[str]
+	subreddit: typing.Optional[str]
+	url: str
+
+	created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
+
+	def __str__(self):
+		return self.url
+
+
+def _cmp_id(id1, id2):
+	'''Compare two Reddit IDs. Returns -1 if id1 is less than id2, 0 if they are equal, and 1 if id1 is greater than id2.
+
+	id1 and id2 may have prefixes like t1_, but if included, they must be present on both and equal.'''
+
+	if id1.startswith('t') and '_' in id1: # Prefixed form: drop the prefix from both IDs before comparing
+		prefix, id1 = id1.split('_', 1)
+		if not id2.startswith(f'{prefix}_'):
+			raise ValueError('id2 must have the same prefix as id1')
+		_, id2 = id2.split('_', 1)
+	if id1.strip(string.ascii_lowercase + string.digits) != '': # Anything left after stripping means a character outside a-z0-9 is present
+		raise ValueError('invalid characters in id1')
+	if id2.strip(string.ascii_lowercase + string.digits) != '':
+		raise ValueError('invalid characters in id2')
+	if len(id1) < len(id2): # Length decides first: a shorter ID always orders before a longer one
+		return -1
+	if len(id1) > len(id2):
+		return 1
+	if id1 < id2: # Equal length: plain string comparison decides
+		return -1
+	if id1 > id2:
+		return 1
+	return 0
+
+
+class _RedditPushshiftScraper(snscrape.base.Scraper):
+	def __init__(self, **kwargs):
+		super().__init__(**kwargs)
+		self._headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}
+
+	def _handle_rate_limiting(self, r):
+		if r.status_code == 429:
+			_logger.info('Got 429 response, sleeping')
+			time.sleep(10)
+			return False, 'rate-limited'
+		if r.status_code != 200:
+			return False, 'non-200 status code'
+		return True, None
+
+	def _get_api(self, url, params = None):
+		r = self._get(url, params = params, headers = self._headers, responseOkCallback = self._handle_rate_limiting)
+		if r.status_code != 200:
+			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+		return r.json()
+
+	def _api_obj_to_item(self, d):
+		cls = Submission if 'title' in d else Comment
+
+		# Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all
+		permalink = d.get('permalink')
+		if permalink is None:
+			# E.g. comment dovj2v7
+			permalink = d.get('permalink_url')
+			if permalink is None:
+				if 'link_id' in d and d['link_id'].startswith('t3_'): # E.g. comment doraazf
+					if 'subreddit' in d:
+						permalink = f'/r/{d["subreddit"]}/comments/{d["link_id"][3:]}/_/{d["id"]}/'
+					else: # E.g. submission 617p51 but can likely happen for comments as well
+						permalink = f'/comments/{d["link_id"][3:]}/_/{d["id"]}/'
+				else:
+					_logger.warning('Unable to find or construct permalink')
+					permalink = '/'
+
+		kwargs = {
+			'author': d.get('author'),
+			'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
+			'url': f'https://old.reddit.com{permalink}',
+			'subreddit': d.get('subreddit'),
+		}
+		if cls is Submission:
+			kwargs['selftext'] = d.get('selftext') or None
+			kwargs['link'] = (d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}') if not kwargs['selftext'] else None
+			if kwargs['link'] == kwargs['url'] or kwargs['url'].replace('//old.reddit.com/', '//www.reddit.com/') == kwargs['link']:
+				kwargs['link'] = None
+			kwargs['title'] = d['title']
+			kwargs['id'] = f't3_{d["id"]}'
+		else:
+			kwargs['body'] = d['body']
+			kwargs['parentId'] = d.get('parent_id')
+			kwargs['id'] = f't1_{d["id"]}'
+
+		return cls(**kwargs)
+
+
+class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
+	def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
+		super().__init__(**kwargs)
+		self._name = name
+		self._submissions = submissions
+		self._comments = comments
+		self._before = before
+		self._after = after
+
+		if not type(self)._validationFunc(self._name):
+			raise ValueError(f'invalid {type(self).name.split("-", 1)[1]} name')
+		if not self._submissions and not self._comments:
+			raise ValueError('At least one of submissions and comments must be True')
+
+	def _iter_api(self, url, params = None):
+		'''Iterate through the Pushshift API using the 'before' parameter and yield the items.'''
+		lowestIdSeen = None
+		if params is None:
+			params = {}
+		if self._before is not None:
+			params['before'] = self._before
+		if self._after is not None:
+			params['after'] = self._after
+		params['sort'] = 'desc'
+		while True:
+			obj = self._get_api(url, params = params)
+			if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination
+				break
+			for d in obj['data']:
+				if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
+					yield self._api_obj_to_item(d)
+					lowestIdSeen = d['id']
+			params['before'] = obj["data"][-1]["created_utc"] + 1
+
+	def _iter_api_submissions_and_comments(self, params: dict):
+		# Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
+		params['size'] = '1000'
+		if self._submissions:
+			submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
+		else:
+			submissionsIter = iter(())
+		if self._comments:
+			commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy())
+		else:
+			commentsIter = iter(())
+
+		try:
+			tipSubmission = next(submissionsIter)
+		except StopIteration:
+			# There are no submissions, just yield comments and return
+			yield from commentsIter
+			return
+		try:
+			tipComment = next(commentsIter)
+		except StopIteration:
+			# There are no comments, just yield submissions and return
+			yield tipSubmission
+			yield from submissionsIter
+			return
+
+		while True:
+			# Return newer first; if both have the same creation datetime, return the comment first
+			if tipSubmission.date > tipComment.date:
+				yield tipSubmission
+				try:
+					tipSubmission = next(submissionsIter)
+				except StopIteration:
+					# Reached the end of submissions, just yield the remaining comments and stop
+					yield tipComment
+					yield from commentsIter
+					break
+			else:
+				yield tipComment
+				try:
+					tipComment = next(commentsIter)
+				except StopIteration:
+					yield tipSubmission
+					yield from submissionsIter
+					break
+
+	def get_items(self):
+		yield from self._iter_api_submissions_and_comments({type(self)._apiField: self._name})
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('--no-submissions', dest = 'noSubmissions', action = 'store_true', default = False, help = 'Don\'t list submissions')
+		subparser.add_argument('--no-comments', dest = 'noComments', action = 'store_true', default = False, help = 'Don\'t list comments')
+		subparser.add_argument('--before', metavar = 'TIMESTAMP', type = int, help = 'Fetch results before a Unix timestamp')
+		subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp')
+		name = cls.name.split('-', 1)[1]
+		subparser.add_argument(name, type = snscrape.base.nonempty_string(name))
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		name = cls.name.split('-', 1)[1]
+		return cls._cli_construct(args, getattr(args, name), submissions = not args.noSubmissions, comments = not args.noComments, before = args.before, after = args.after)
+
+
+class RedditUserScraper(_RedditPushshiftSearchScraper):
+	name = 'reddit-user'
+	_validationFunc = lambda x: re.match('^[A-Za-z0-9_-]{3,20}$', x)
+	_apiField = 'author'
+
+
+class RedditSubredditScraper(_RedditPushshiftSearchScraper):
+	name = 'reddit-subreddit'
+	_validationFunc = lambda x: re.match('^[A-Za-z0-9][A-Za-z0-9_]{2,20}$', x)
+	_apiField = 'subreddit'
+
+
+class RedditSearchScraper(_RedditPushshiftSearchScraper):
+	name = 'reddit-search'
+	_validationFunc = lambda x: True
+	_apiField = 'q'
+
+
+class RedditSubmissionScraper(_RedditPushshiftScraper):
+	name = 'reddit-submission'
+
+	def __init__(self, submissionId, **kwargs):
+		if (submissionId[3:] if submissionId.startswith('t3_') else submissionId).strip(string.ascii_lowercase + string.digits) != '':
+			raise ValueError('invalid submissionId')
+		super().__init__(**kwargs)
+		self._submissionId = submissionId
+
+	def get_items(self):
+		obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')
+		if not obj['data']:
+			return
+		if len(obj['data']) != 1:
+			raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1')
+		yield self._api_obj_to_item(obj['data'][0])
+
+		obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')
+		if not obj['data']:
+			return
+		commentIds = obj['data']
+		for i in range(0, len(commentIds), 500):
+			ids = commentIds[i : i + 500]
+			obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}')
+			yield from map(self._api_obj_to_item, obj['data'])
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId'))
+
+	@classmethod
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.submissionId)
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -0,0 +1,331 @@
+__all__ = ['LinkPreview', 'TelegramPost', 'Channel', 'TelegramChannelScraper']
+
+
+import bs4
+import dataclasses
+import datetime
+import logging
+import re
+import snscrape.base
+import typing
+import urllib.parse
+import base64
+
+_logger = logging.getLogger(__name__)
+_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
+_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
+
+@dataclasses.dataclass
+class LinkPreview:
+	href: str
+	siteName: typing.Optional[str] = None
+	title: typing.Optional[str] = None
+	description: typing.Optional[str] = None
+	image: typing.Optional[str] = None
+
+
+@dataclasses.dataclass
+class Channel(snscrape.base.Entity):
+	username: str
+	title: typing.Optional[str] = None
+	verified: typing.Optional[bool] = None
+	photo: typing.Optional[str] = None
+	description: typing.Optional[str] = None
+	members: typing.Optional[int] = None
+	photos: typing.Optional[snscrape.base.IntWithGranularity] = None
+	videos: typing.Optional[snscrape.base.IntWithGranularity] = None
+	links: typing.Optional[snscrape.base.IntWithGranularity] = None
+	files: typing.Optional[snscrape.base.IntWithGranularity] = None
+
+	photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
+	videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
+	linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
+	filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')
+
+	def __str__(self):
+		return f'https://t.me/s/{self.username}'
+
+
+@dataclasses.dataclass
+class TelegramPost(snscrape.base.Item):
+	url: str
+	date: datetime.datetime
+	content: typing.Optional[str] # None when the post has no message text element
+	outlinks: typing.Optional[typing.List[str]] = None
+	mentions: typing.Optional[typing.List[str]] = None
+	hashtags: typing.Optional[typing.List[str]] = None
+	forwarded: typing.Optional['Channel'] = None # Channel the post was forwarded from, if any
+	forwardedUrl: typing.Optional[str] = None
+	media: typing.Optional[typing.List['Medium']] = None
+	views: typing.Optional[int] = None
+	linkPreview: typing.Optional[LinkPreview] = None
+
+	outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
+
+	def __str__(self):
+		return self.url
+
+
+class Medium:
+	pass
+
+
+@dataclasses.dataclass
+class Photo(Medium):
+	url: str
+
+
+@dataclasses.dataclass
+class Video(Medium):
+	thumbnailUrl: str
+	duration: float
+	url: typing.Optional[str] = None
+
+
+@dataclasses.dataclass
+class VoiceMessage(Medium):
+	url: str
+	duration: float # Filled from durationStrToSeconds(), like Video.duration; presumably seconds, confirm against that helper
+	bars: typing.List[float] # Bar heights parsed from the voice player's inline styles
+
+
+@dataclasses.dataclass
+class Gif(Medium):
+	thumbnailUrl: str
+	url: typing.Optional[str] = None
+
+
+class TelegramChannelScraper(snscrape.base.Scraper):
+	name = 'telegram-channel'
+
+	def __init__(self, name, **kwargs):
+		super().__init__(**kwargs)
+		self._name = name
+		self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
+		self._initialPage = None
+		self._initialPageSoup = None
+
+	def _initial_page(self):
+		if self._initialPage is None: # Fetch and parse only on the first call; later calls reuse the stored response and soup
+			r = self._get(f'https://t.me/s/{self._name}', headers = self._headers)
+			if r.status_code != 200:
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml')
+		return self._initialPage, self._initialPageSoup
+
+	def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
+		posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
+		for post in reversed(posts):
+			if onlyUsername:
+				yield post['data-post'].split('/')[0]
+				return
+			dateDiv = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date')
+			rawUrl = dateDiv['href']
+			if not rawUrl.startswith('https://t.me/') or sum(x == '/' for x in rawUrl) != 4 or rawUrl.rsplit('/', 1)[1].strip('0123456789') != '':
+				_logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
+			url = rawUrl.replace('//t.me/', '//t.me/s/')
+			date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
+			media = []
+			outlinks = []
+			mentions = []
+			hashtags = []
+			forwarded = None
+			forwardedUrl = None
+
+			if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
+				forwardedUrl = forwardTag['href']
+				forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
+				forwarded = Channel(username = forwardedName)
+
+			if (message := post.find('div', class_ = 'tgme_widget_message_text')):
+				content = message.get_text(separator="\n")
+			else:
+				content = None
+
+			for link in post.find_all('a'):
+				if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
+					# Author links at the top (avatar and name)
+					continue
+				if link['href'] == rawUrl or link['href'] == url:
+					style = link.attrs.get('style', '')
+					# Generic filter of links to the post itself, catches videos, photos, and the date link
+					if style != '':
+						imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
+						if len(imageUrls) == 1:
+							media.append(Photo(url = imageUrls[0]))
+						continue
+				if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
+					style = link.attrs.get('style', '')
+					imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
+					if len(imageUrls) == 1:
+						media.append(Photo(url = imageUrls[0]))
+						# resp = self._get(image[0])
+						# encoded_string = base64.b64encode(resp.content)
+					# Individual photo or video link
+					continue
+				if link.text.startswith('@'):
+					mentions.append(link.text.strip('@'))
+					continue
+				if link.text.startswith('#'):
+					hashtags.append(link.text.strip('#'))
+					continue
+				href = urllib.parse.urljoin(pageUrl, link['href'])
+				if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
+					outlinks.append(href)
+
+			for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
+				audioUrl = voicePlayer.find('audio')['src']
+				durationStr = voicePlayer.find('time').text
+				duration = durationStrToSeconds(durationStr)
+				barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
+
+				media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
+
+			for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
+				iTag = videoPlayer.find('i')
+				if iTag is None:
+					videoUrl = None 
+					videoThumbnailUrl = None
+				else:
+					style = iTag['style']
+					videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
+					videoTag = videoPlayer.find('video')
+					videoUrl = None if videoTag is None else videoTag['src']
+				mKwargs = {
+					'thumbnailUrl': videoThumbnailUrl,
+					'url': videoUrl,
+				}
+				timeTag = videoPlayer.find('time')
+				if timeTag is None:
+					cls = Gif
+				else:
+					cls = Video
+					durationStr = videoPlayer.find('time').text
+					mKwargs['duration'] = durationStrToSeconds(durationStr)
+				media.append(cls(**mKwargs))
+
+			linkPreview = None
+			if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
+				kwargs = {}
+				kwargs['href'] = urllib.parse.urljoin(pageUrl, linkPreviewA['href'])
+				if (siteNameDiv := linkPreviewA.find('div', class_ = 'link_preview_site_name')):
+					kwargs['siteName'] = siteNameDiv.text
+				if (titleDiv := linkPreviewA.find('div', class_ = 'link_preview_title')):
+					kwargs['title'] = titleDiv.text
+				if (descriptionDiv := linkPreviewA.find('div', class_ = 'link_preview_description')):
+					kwargs['description'] = descriptionDiv.text
+				if (imageI := linkPreviewA.find('i', class_ = 'link_preview_image')):
+					if imageI['style'].startswith("background-image:url('"):
+						kwargs['image'] = imageI['style'][22 : imageI['style'].index("'", 22)]
+					else:
+						_logger.warning(f'Could not process link preview image on {url}')
+				linkPreview = LinkPreview(**kwargs)
+				if kwargs['href'] in outlinks:
+					outlinks.remove(kwargs['href'])
+
+			viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
+			views = None if viewsSpan is None else parse_num(viewsSpan.text)
+			
+			yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
+
+	def get_items(self): # Generator over the channel's posts, walking the public preview pages towards older messages
+		r, soup = self._initial_page()
+		if '/s/' not in r.url:
+			_logger.warning('No public post list for this user')
+			return
+		nextPageUrl = ''
+		while True:
+			yield from self._soup_to_items(soup, r.url)
+			try:
+				if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
+					# if message 1 is the first message in the page, terminate scraping
+					break
+			except TypeError:
+				pass # find() returned None (no dated message link on this page); fall through to the pagination logic below
+			pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
+			if not pageLink:
+				# some pages are missing a "tme_messages_more" tag, causing early termination
+				if '=' not in nextPageUrl:
+					nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
+				nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 # step the index after the last '=' back by 20
+				if nextPostIndex > 20:
+					pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
+				else:
+					break
+			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
+			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
+			if r.status_code != 200:
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			soup = bs4.BeautifulSoup(r.text, 'lxml')
+
+	def _get_entity(self):
+		# Build the Channel entity from t.me/<name> (members, photo) plus the public post list page; returns None if there is no such list.
+		kwargs = {}
+		# /channel has a more accurate member count and bigger profile picture
+		r = self._get(f'https://t.me/{self._name}', headers = self._headers)
+		if r.status_code != 200:
+			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+		soup = bs4.BeautifulSoup(r.text, 'lxml')
+		membersDiv = soup.find('div', class_ = 'tgme_page_extra')
+		# find() returns None when the div is absent; in that case leave the member count unset instead of crashing.
+		if membersDiv is not None and membersDiv.text.endswith((' members', ' subscribers')):
+			kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
+		photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
+		kwargs['photo'] = photoImg.attrs['src'] if photoImg is not None else None
+
+		r, soup = self._initial_page()
+		if '/s/' not in r.url: # Redirect on channels without public posts
+			return
+		channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
+		assert channelInfoDiv, 'channel info div not found'
+		titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
+		kwargs['title'] = titleDiv.find('span').text
+		kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon'))
+		# The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere.
+		# However, the post URLs are, so extract the first post and use that.
+		try:
+			kwargs['username'] = next(self._soup_to_items(soup, r.url, onlyUsername = True))
+		except StopIteration:
+			# If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
+			_logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
+			kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
+		if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
+			kwargs['description'] = descriptionDiv.text
+
+		# Counters from the channel info box; only the photo/video/link/file counters are stored, each with its granularity.
+		for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
+			value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
+			type_ = div.find('span', class_ = 'counter_type').text
+			if type_ == 'members':
+				# Already extracted more accurately from /channel, skip
+				continue
+			elif type_ in ('photos', 'videos', 'links', 'files'):
+				kwargs[type_] = snscrape.base.IntWithGranularity(value, granularity)
+
+		return Channel(**kwargs)
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser): # Registers the single positional 'channel' argument on this scraper's CLI subparser
+		subparser.add_argument('channel', type = snscrape.base.nonempty_string('channel'), help = 'A channel name')
+
+	@classmethod
+	def _cli_from_args(cls, args): # Builds the scraper from parsed CLI args, handing the channel name to cls._cli_construct
+		return cls._cli_construct(args, args.channel)
+
+def parse_num(s):
+	s = s.replace(' ', '') # Parses '1 234', '1.2K' or '3.45M' into (value, granularity), e.g. '1.2K' -> (1200, 100)
+	for suffix, exponent in (('M', 6), ('K', 3)):
+		if s.endswith(suffix):
+			# round() rather than int(): the float product can land just below the intended integer and int() would truncate it
+			return round(float(s[:-1]) * 10 ** exponent), 10 ** (exponent - len(s[:-1].partition('.')[2]))
+	return int(s), 1
+
+def durationStrToSeconds(durationStr):
+	# Converts 'SS', 'MM:SS' or 'HH:MM:SS' to seconds; fields are weighted right to left by 1, 60 and 3600 (an hour is 3600 s, not 360).
+	return sum(int(part) * factor for factor, part in zip((1, 60, 3600), reversed(durationStr.split(':'))))
+
+def telegramResponseOkCallback(r):
+	# Returns an (ok, message) pair: only HTTP 200 is ok; otherwise the message names the status code received.
+	ok = r.status_code == 200
+	return (ok, None if ok else f'{r.status_code=}')
+	
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -1,100 +1,410 @@
+__all__ = ['VKontaktePost', 'Photo', 'PhotoVariant', 'Video', 'User', 'VKontakteUserScraper']
+
+
 import bs4
+import collections
+import dataclasses
 import datetime
 import itertools
+import json
 import logging
+import re
 import snscrape.base
 import typing
 import urllib.parse
+try:
+	import zoneinfo
+except ImportError:
+	# Python 3.8 support; nowadays, Europe/Moscow is always UTC+3, but it's more complicated before 2014, so need proper zone info
+	import pytz
+	def _timezone(s):
+		return pytz.timezone(s)
+	def _localised_datetime(tz, *args, **kwargs):
+		return tz.localize(datetime.datetime(*args, **kwargs))
+else:
+	def _timezone(s):
+		return zoneinfo.ZoneInfo(s)
+	def _localised_datetime(tz, *args, **kwargs):
+		return datetime.datetime(*args, tzinfo = tz, **kwargs)


-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
+_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] # list index + 1 is the month number
+_datePattern = re.compile(r'^(?P<date>today'
+                                  r'|yesterday'
+                                  r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?' # raw string: \s and \d are not valid str escapes
+                                  r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
+                           ')'
+                          r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')


-class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
+@dataclasses.dataclass
+class User(snscrape.base.Entity): # A VK profile/page; str() yields its https://vk.com/<username> URL
+	username: str
+	name: str
+	verified: bool
+	description: typing.Optional[str] = None
+	websites: typing.Optional[typing.List[str]] = None
+	followers: typing.Optional[snscrape.base.IntWithGranularity] = None
+	posts: typing.Optional[snscrape.base.IntWithGranularity] = None
+	photos: typing.Optional[snscrape.base.IntWithGranularity] = None
+	tags: typing.Optional[snscrape.base.IntWithGranularity] = None
+	following: typing.Optional[snscrape.base.IntWithGranularity] = None
+
+	followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity') # presumably legacy aliases for <count>.granularity
+	postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
+	photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
+	tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
+	followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
+
+	def __str__(self):
+		return f'https://vk.com/{self.username}'
+
+
+@dataclasses.dataclass
+class VKontaktePost(snscrape.base.Item):
 	url: str
-	date: datetime.datetime
+	date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
 	content: str
+	user: User
+	outlinks: typing.Optional[typing.List[str]] = None
+	photos: typing.Optional[typing.List['Photo']] = None
+	video: typing.Optional['Video'] = None
+	quotedPost: typing.Optional['VKontaktePost'] = None

 	def __str__(self):
 		return self.url


+@dataclasses.dataclass
+class Photo: # One photo attached to a post
+	variants: typing.List['PhotoVariant'] # one entry per size variant found in the thumb's onclick data
+	url: typing.Optional[str] = None # https://vk.com/photo... page URL, or None if the thumb's href does not have that form
+
+
+@dataclasses.dataclass
+class PhotoVariant: # One size variant of a Photo
+	url: str # image URL; '.jpg' is appended by the scraper when the source URL lacks it
+	width: int
+	height: int
+
+
+@dataclasses.dataclass
+class Video: # Video attached to a post, filled from the thumb link's data-* attributes
+	id: str # value of the 'data-video' attribute
+	list: str # value of the 'data-list' attribute
+	duration: int # int() of the 'data-duration' attribute; presumably seconds - TODO confirm
+	url: str
+	thumbUrl: str # taken from the background-image URL in the thumb's inline style
+
+
 class VKontakteUserScraper(snscrape.base.Scraper):
 	name = 'vkontakte-user'

 	def __init__(self, username, **kwargs):
 		super().__init__(**kwargs)
 		self._username = username
+		self._baseUrl = f'https://vk.com/{self._username}'
+		self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
+		self._initialPage = None
+		self._initialPageSoup = None

-	def _soup_to_items(self, soup, baseUrl):
-		for post in soup.find_all('div', class_ = 'post'):
+	def _away_a_to_url(self, a):
+		# Unwrap an <a> whose href is /away.php?to=<quoted URL>[&...] into the plain target URL; None for any other or missing tag.
+		prefix = '/away.php?to='
+		if not a or not a.get('href', '').startswith(prefix):
+			return None
+		# The quoted target ends at the first '&' after the prefix, or runs to the end of the href if there is none.
+		target, _, _ = a['href'][len(prefix):].partition('&')
+		return urllib.parse.unquote(target)
+
+	def _date_span_to_date(self, dateSpan): # Returns an aware datetime, a plain date (date-only text), or None if missing/unparseable
+		if not dateSpan:
+			return None
+		if 'time' in dateSpan.attrs:
+			return datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc) # 'time' attribute is read as a Unix timestamp
+		if (match := _datePattern.match(dateSpan.text)):
+			# Datetime information down to minutes; the wall-clock values are interpreted in the Europe/Moscow zone
+			tz = _timezone('Europe/Moscow')
+			if match.group('date') in ('today', 'yesterday'):
+				date = datetime.datetime.now(tz = tz)
+				if match.group('date') == 'yesterday':
+					date -= datetime.timedelta(days = 1)
+				year, month, day = date.year, date.month, date.day
+			else:
+				year = int(match.group('year1') or match.group('year2') or datetime.datetime.now(tz = tz).year) # no year in the text: use the current one
+				month = _months.index(match.group('month1') or match.group('month2')) + 1
+				day = int(match.group('day1') or match.group('day2'))
+			hour = int(match.group('hour'))
+			# 12-hour clock: 12 am is hour 0 and 12 pm is hour 12, so fold 12 to 0 first, then add 12 for pm
+			if hour == 12:
+				hour -= 12
+			if match.group('ampm') == 'pm':
+				hour += 12
+			minute = int(match.group('minute'))
+			return _localised_datetime(tz, year, month, day, hour, minute)
+		if (match := re.match(r'^(?P<day>\d+)\s+(?P<month>' + '|'.join(_months) + r')\s+(?P<year>\d{4})$', dateSpan.text)):
+			# Date only
+			return datetime.date(int(match.group('year')), _months.index(match.group('month')) + 1, int(match.group('day')))
+		if dateSpan.text not in ('video', 'photo'): # Silently ignore video and photo reposts which have no original date attached
+			_logger.warning(f'Could not parse date string: {dateSpan.text!r}') # falls off the end afterwards, i.e. returns None
+
+	def _post_div_to_item(self, post, isCopy = False):
+		postLink = post.find('a', class_ = 'post_link' if not isCopy else 'published_by_date')
+		if not postLink:
+			_logger.warning(f'Skipping post without link: {str(post)[:200]!r}')
+			return
+		url = urllib.parse.urljoin(self._baseUrl, postLink['href'])
+		assert (url.startswith('https://vk.com/wall') or (isCopy and (url.startswith('https://vk.com/video') or url.startswith('https://vk.com/photo')))) and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') in ('', '?reply=')
+		if not isCopy:
 			dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
-			textDiv = post.find('div', class_ = 'wall_post_text')
-			yield VKontaktePost(
-			  url = urllib.parse.urljoin(baseUrl, post.find('a', class_ = 'post_link')['href']),
-			  date = datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc) if 'time' in dateSpan else None,
-			  content = textDiv.text if textDiv else None,
-			 )
+		else:
+			dateSpan = post.find('div', class_ = 'copy_post_date').find('a', class_ = 'published_by_date')
+		textDiv = post.find('div', class_ = 'wall_post_text')
+		outlinks = [h for a in textDiv.find_all('a') if (h := self._away_a_to_url(a))] if textDiv else []
+		if (mediaLinkDiv := post.find('div', class_ = 'media_link')) and \
+		   (mediaLinkA := mediaLinkDiv.find('a', class_ = 'media_link__title')) and \
+		   (href := self._away_a_to_url(mediaLinkA)) and \
+		   href not in outlinks:
+			outlinks.append(href)
+		photos = None
+		video = None
+		if (thumbsDiv := (post.find('div', class_ = 'wall_text') if not isCopy else post).find('div', class_ = 'page_post_sized_thumbs')) and \
+		   not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
+			photos = []
+			for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
+				if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
+					_logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
+					continue
+				if 'data-video' in a.attrs:
+					# Video
+					if 'data-link-attr' in a.attrs:
+						hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
+					else:
+						hrefUrl = f'https://vk.com{a["href"]}'
+					video = Video(
+						id = a['data-video'],
+						list = a['data-list'],
+						duration = int(a['data-duration']),
+						url = hrefUrl,
+						thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
+					)
+					continue
+				# From here on: photo
+				if 'onclick' not in a.attrs or not a['onclick'].startswith("return showPhoto('") or '{"temp":' not in a['onclick'] or not a['onclick'].endswith('}, event)'):
+					_logger.warning(f'Photo thumb wrap on {url} has no or unexpected onclick, skipping')
+					continue
+				photoData = a['onclick'][a['onclick'].find('{"temp":') : -8] # -8 = len(', event)')
+				photoObj = json.loads(photoData)
+				singleLetterKeys = [k for k in photoObj['temp'].keys() if len(k) == 1 and 97 <= ord(k) <= 122] # 97 = ord('a'), 122 = ord('z')
+				for x in singleLetterKeys:
+					# Merge base into URLs
+					if not photoObj['temp'][x].startswith('https://'):
+						photoObj['temp'][x] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x]}'
+					x_ = f'{x}_'
+					if not photoObj['temp'][x_][0].startswith('https://'):
+						photoObj['temp'][x_][0] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x_][0]}'
+				if any(k not in {'base', 'w', 'w_', 'x', 'x_', 'y', 'y_', 'z', 'z_'} for k in photoObj['temp'].keys()) or \
+				   not all(photoObj['temp'][x] in (photoObj['temp'][f'{x}_'][0], photoObj['temp'][f'{x}_'][0] + '.jpg') for x in singleLetterKeys) or \
+				   not all(photoObj['temp'][x].startswith('https://sun') and '.userapi.com/' in photoObj['temp'][x] for x in singleLetterKeys) or \
+				   not all(len(photoObj['temp'][(x_ := f'{x}_')]) == 3 and isinstance(photoObj['temp'][x_][1], int) and isinstance(photoObj['temp'][x_][2], int) for x in singleLetterKeys):
+					_logger.warning(f'Photo thumb wrap on {url} has unexpected data structure, skipping')
+					continue
+				photoVariants = []
+				for x in singleLetterKeys:
+					x_ = f'{x}_'
+					photoVariants.append(PhotoVariant(url = f'{photoObj["temp"][x_][0]}.jpg' if '.jpg' not in photoObj['temp'][x_][0] else photoObj['temp'][x_][0], width = photoObj['temp'][x_][1], height = photoObj['temp'][x_][2]))
+				photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
+				photos.append(Photo(variants = photoVariants, url = photoUrl))
+		quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
+		authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
+		authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
+		username = authorLink['href'].split('/')[-1]
+		name = authorLink.text
+		if authorHeading.find('div', class_ = 'page_verified') is not None:
+			verified = True
+		else:
+			verified = False
+		user = User(username = username, name = name, verified = verified)
+		return VKontaktePost(
+			url = url,
+			date = self._date_span_to_date(dateSpan),
+			content = textDiv.text if textDiv else None,
+			user = user,
+			outlinks = outlinks or None,
+			photos = photos or None,
+			video = video or None,
+			quotedPost = quotedPost,
+		 )
+
+	def _soup_to_items(self, soup):
+		# Yields one item per 'post' div; _post_div_to_item returns None for posts it skips (no link), and those must not reach callers.
+		yield from (item for post in soup.find_all('div', class_ = 'post') if (item := self._post_div_to_item(post)) is not None)
+
+	def _initial_page(self): # Fetches and parses the profile page on first use; later calls return the cached (response, soup) pair
+		if self._initialPage is None:
+			_logger.info('Retrieving initial data')
+			r = self._get(self._baseUrl, headers = self._headers)
+			if r.status_code not in (200, 404): # a 404 response is cached too, so callers can inspect the status themselves
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
+			self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
+		return self._initialPage, self._initialPageSoup

 	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
-		baseUrl = f'https://vk.com/{self._username}'
-
-		logger.info('Retrieving initial data')
-		r = self._get(baseUrl, headers = headers)
+		r, soup = self._initial_page()
 		if r.status_code == 404:
-			logger.error('Wall does not exist')
+			_logger.warning('Wall does not exist')
 			return
-		elif r.status_code != 200:
-			logger.error(f'Got status code {r.status_code}')
-			return
-
-		# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
-		soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)

 		if soup.find('div', class_ = 'profile_closed_wall_dummy'):
-			logger.error('Private profile')
+			_logger.warning('Private profile')
+			return
+
+		if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
+			# Unclear what this state represents, so just log website text.
+			_logger.warning(profileDeleted.text)
 			return

 		newestPost = soup.find('div', class_ = 'post')
 		if not newestPost:
-			logger.info('Wall has no posts')
+			_logger.info('Wall has no posts')
 			return
 		ownerID = newestPost.attrs['data-post-id'].split('_')[0]
 		# If there is a pinned post, we need its ID for the pagination requests
 		if 'post_fixed' in newestPost.attrs['class']:
-			fixedPostID = newestPost.attrs['id'].split('_')[1]
+			fixedPostID = int(newestPost.attrs['id'].split('_')[1])
 		else:
 			fixedPostID = ''

-		yield from self._soup_to_items(soup, baseUrl)
+		last1000PostIDs = collections.deque(maxlen = 1000)

-		headers['X-Requested-With'] = 'XMLHttpRequest'
+		def _process_soup(soup):
+			nonlocal last1000PostIDs
+			for item in self._soup_to_items(soup):
+				postID = int(item.url.rsplit('_', 1)[1])
+				if postID not in last1000PostIDs:
+					yield item
+					last1000PostIDs.append(postID)
+
+		yield from _process_soup(soup)
+
+		lastWorkingOffset = 0
 		for offset in itertools.count(start = 10, step = 10):
-			logger.info('Retrieving next page')
-			r = self._post(
-			  'https://vk.com/al_wall.php',
-			  data = [('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset), ('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset)],
-			  headers = headers
-			 )
-			if r.status_code != 200:
-				logger.error(f'Got status code {r.status_code}')
-				return
-			fields = r.content.split(b'<!>')
-			if fields[5].startswith(b'<div class="page_block no_posts">'):
+			posts = self._get_wall_offset(fixedPostID, ownerID, offset)
+			if posts.startswith('<div class="page_block no_posts">'):
 				# Reached the end
 				break
-			if not fields[5].startswith(b'<div id="post'):
-				logger.error(f'Got an unknown response: {fields[5][:200]!r}...')
-				break
-			soup = bs4.BeautifulSoup(fields[5], 'lxml', from_encoding = r.encoding)
-			yield from self._soup_to_items(soup, baseUrl)
+			if not posts.startswith('<div id="post'):
+				if posts == '"\\/blank.php?block=119910902"':
+					_logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
+					for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
+						geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
+						if geoPosts.startswith('<div class="page_block no_posts">'):
+							# No breaking the outer loop, it'll just make one extra request and exit as well
+							break
+						if not geoPosts.startswith('<div id="post'):
+							if geoPosts == '"\\/blank.php?block=119910902"':
+								continue
+							raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
+						yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
+					continue
+				raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
+			lastWorkingOffset = offset
+			soup = bs4.BeautifulSoup(posts, 'lxml')
+			yield from _process_soup(soup)
+
+	def _get_wall_offset(self, fixedPostID, ownerID, offset):
+		# POSTs an act=get_wall request to al_wall.php for the given offset and returns the HTML string found in the JSON payload.
+		_logger.info(f'Retrieving page offset {offset}')
+		xhrHeaders = {**self._headers, 'X-Requested-With': 'XMLHttpRequest'}
+		formData = [
+			('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset),
+			('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset),
+		]
+		response = self._post('https://vk.com/al_wall.php', data = formData, headers = xhrHeaders)
+		if response.status_code != 200:
+			raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
+		# Decoding the body as JSON also turns the data into a Python (Unicode) string, away from windows-1251-encoded bytes.
+		# The HTML payload sits at ['payload'][1][0] of the decoded object.
+		return response.json()['payload'][1][0]
+
+	def _get_entity(self): # Builds the User entity from the cached profile page; returns None unless that page came back with status 200
+		r, soup = self._initial_page()
+		if r.status_code != 200:
+			return
+		kwargs = {}
+		kwargs['username'] = r.url.rsplit('/', 1)[1]
+		nameH1 = soup.find('h1', class_ = 'page_name')
+		kwargs['name'] = nameH1.text
+		kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
+
+		if (descriptionDiv := soup.find('div', id = 'page_current_info')):
+			kwargs['description'] = descriptionDiv.text
+
+		if (infoDiv := soup.find('div', id = 'page_info_wrap')):
+			websites = []
+			for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
+				if 'profile_info_row' in rowDiv['class']:
+					labelDiv = rowDiv.find('div', class_ = 'fl_l')
+					if not labelDiv or labelDiv.text != 'Website:':
+						continue
+				else: # group_info_row
+					if rowDiv['title'] == 'Description':
+						kwargs['description'] = rowDiv.text
+					if rowDiv['title'] != 'Website':
+						continue
+				for a in rowDiv.find_all('a'):
+					if not a['href'].startswith('/away.php?to='):
+						_logger.warning(f'Skipping odd website link: {a["href"]!r}')
+						continue
+					websites.append(urllib.parse.unquote(a['href'].split('=', 1)[1].split('&', 1)[0]))
+			if websites:
+				kwargs['websites'] = websites
+
+		def parse_num(s: str) -> typing.Tuple[int, int]:
+			# Returns (value, granularity), e.g. '1.2M' -> (1200000, 100000); plain numbers may carry ',' thousands separators.
+			for suffix, scale in (('K', 1000), ('M', 1000000)):
+				if not s.endswith(suffix):
+					continue
+				baseNum = s[:-1]
+				# Each decimal digit shown refines the granularity tenfold; decimals are accepted for 'K' as well as 'M'.
+				precision = scale // (10 ** len(baseNum.partition('.')[2]))
+				return round(float(baseNum) * scale), precision # round(): the float product may fall just short of the integer
+			# No suffix: an exact count
+			return int(s.replace(',', '')), 1
+
+		if (countsDiv := soup.find('div', class_ = 'counts_module')):
+			for a in countsDiv.find_all('a', class_ = 'page_counter'):
+				count, granularity = parse_num(a.find('div', class_ = 'count').text)
+				label = a.find('div', class_ = 'label').text
+				if label in ('follower', 'post', 'photo', 'tag'): # singular labels are mapped onto the plural field names
+					label = f'{label}s'
+				if label in ('followers', 'posts', 'photos', 'tags'):
+					kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)
+
+		if (idolsDiv := soup.find('div', id = 'profile_idols')):
+			if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
+				kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
+
+		# On public pages, this is where followers are listed
+		if (followersDiv := soup.find('div', id = 'public_followers')):
+			if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
+				kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
+		# On community groups, this is where followers are listed
+		elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
+			kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
+		# On public groups, this is where followers are listed
+		elif (followersDiv := soup.find('div', id = 'group_followers')):
+			if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
+				kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
+
+		return User(**kwargs)

 	@classmethod
-	def setup_parser(cls, subparser):
-		subparser.add_argument('username', help = 'A VK username')
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A VK username')

 	@classmethod
-	def from_args(cls, args):
-		return cls(args.username, retries = args.retries)
-
+	def _cli_from_args(cls, args):
+		return cls._cli_construct(args, args.username)
--- a/snscrape/modules/weibo.py
+++ b/snscrape/modules/weibo.py
@@ -0,0 +1,151 @@
+__all__ = ['Post', 'User', 'WeiboUserScraper']
+
+
+import dataclasses
+import logging
+import re
+import snscrape.base
+import typing
+
+
+_logger = logging.getLogger(__name__)
+_userDoesNotExist = object()
+_HTML_STRIP_PATTERN = re.compile(r'<[^>]*>')
+
+
+@dataclasses.dataclass
+class Post(snscrape.base.Item): # One Weibo post; str() yields its URL
+	url: str
+	id: str
+	user: typing.Optional['User']
+	createdAt: str # Can have a variety of inconsistent formats
+	text: str # the API's raw_text if present, otherwise its HTML text with tags stripped
+	repostsCount: typing.Optional[int]
+	commentsCount: typing.Optional[typing.Union[int, str]]
+	likesCount: typing.Optional[int] # filled from the API's 'attitudes_count'
+	picturesCount: typing.Optional[int]
+	pictures: typing.Optional[typing.List[str]] # May be shorter than picturesCount if the API didn't return all of them (e.g. post Ipay2evb0)
+	video: typing.Optional[str] # set only when the post's page_info is of type 'video'
+	link: typing.Optional[str] # set only when the post's page_info is of type 'webpage'
+	repostedPost: typing.Optional['Post']
+
+	def __str__(self):
+		return self.url
+
+
+@dataclasses.dataclass
+class User(snscrape.base.Entity): # A Weibo account; str() yields its https://m.weibo.cn/u/<uid> URL
+	screenname: str
+	uid: int
+	verified: bool
+	verifiedReason: typing.Optional[str] # None when the API response carries no 'verified_reason'
+	description: str
+	statusesCount: int
+	followersCount: int
+	followCount: int
+	avatar: str # filled from the API's 'avatar_hd' field
+
+	def __str__(self):
+		return f'https://m.weibo.cn/u/{self.uid}'
+
+
+class WeiboUserScraper(snscrape.base.Scraper): # Scrapes a Weibo user's posts and profile through the m.weibo.cn endpoints
+	name = 'weibo-user'
+
+	def __init__(self, user, **kwargs): # user: an int is taken as a user ID, anything else as a username to be resolved
+		super().__init__(**kwargs)
+		self._user = user
+		self._isUserId = isinstance(user, int)
+		self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
+
+	def _ensure_user_id(self): # Resolves a username to its uid once; leaves the _userDoesNotExist sentinel in self._user if there is no such user
+		if self._isUserId or self._user is _userDoesNotExist: # already resolved either way; don't request /n/<sentinel repr> on a second call
+			return
+		r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
+		if r.status_code == 302 and r.headers['Location'].startswith('/u/') and len(r.headers['Location']) == 13 and r.headers['Location'][3:].strip('0123456789') == '':
+			# Redirect to uid URL
+			self._user = int(r.headers['Location'][3:])
+			self._isUserId = True
+		elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
+			_logger.warning('User does not exist')
+			self._user = _userDoesNotExist
+		else:
+			raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')
+
+	def _check_timeline_response(self, r): # responseOkCallback for timeline requests: returns (ok, reason)
+		if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
+			# 'No content here yet'. Appears to happen sometimes on pagination, possibly due to too fast requests; retry this
+			return False, 'no-content message'
+		if r.status_code != 200:
+			return False, 'non-200 status code'
+		return True, None
+
+	def _mblog_to_item(self, mblog): # Converts one 'mblog' dict from the API into a Post, recursing into a reposted status if present
+		return Post(
+			url = f'https://m.weibo.cn/status/{mblog["bid"]}',
+			id = mblog['id'],
+			user = self._user_info_to_entity(mblog['user']) if mblog['user'] is not None else None,
+			createdAt = mblog['created_at'],
+			text = mblog['raw_text'] if 'raw_text' in mblog else _HTML_STRIP_PATTERN.sub('', mblog['text']),
+			repostsCount = mblog.get('reposts_count'),
+			commentsCount = mblog.get('comments_count'),
+			likesCount = mblog.get('attitudes_count'),
+			picturesCount = mblog.get('pic_num'),
+			pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None,
+			video = mblog['page_info']['media_info']['mp4_720p_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None,
+			link = mblog['page_info']['page_url'] if 'page_info' in mblog and mblog['page_info']['type'] == 'webpage' else None,
+			repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None,
+		  )
+
+	def get_items(self): # Generator over the user's posts, paginating with the since_id the API returns until none is given
+		self._ensure_user_id()
+		if self._user is _userDoesNotExist:
+			return
+		sinceId = None
+		while True:
+			sinceParam = f'&since_id={sinceId}' if sinceId is not None else ''
+			r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response)
+			if r.status_code != 200:
+				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+			o = r.json()
+			for card in o['data']['cards']:
+				if card['card_type'] != 9: # only cards of type 9 are converted into posts here
+					_logger.warning(f'Skipping card of type {card["card_type"]}')
+					continue
+				yield self._mblog_to_item(card['mblog'])
+			if 'since_id' not in o['data']['cardlistInfo']:
+				# End of pagination
+				break
+			sinceId = o['data']['cardlistInfo']['since_id']
+
+	def _user_info_to_entity(self, userInfo): # Maps a user-info dict from the API onto the User dataclass
+		return User(
+			screenname = userInfo['screen_name'],
+			uid = userInfo['id'],
+			verified = userInfo['verified'],
+			verifiedReason = userInfo.get('verified_reason'),
+			description = userInfo['description'],
+			statusesCount = userInfo['statuses_count'],
+			followersCount = userInfo['followers_count'],
+			followCount = userInfo['follow_count'],
+			avatar = userInfo['avatar_hd'],
+		  )
+
+	def _get_entity(self): # Fetches the user's profile info and returns it as a User; None if the user does not exist
+		self._ensure_user_id()
+		if self._user is _userDoesNotExist:
+			return
+		r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
+		if r.status_code != 200:
+			raise snscrape.base.ScraperException('Could not fetch user info')
+		o = r.json()
+		return self._user_info_to_entity(o['data']['userInfo'])
+
+	@classmethod
+	def _cli_setup_parser(cls, subparser):
+		subparser.add_argument('--name', dest = 'isName', action = 'store_true', help = 'Use username instead of user ID')
+		subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user ID')
+
+	@classmethod
+	def _cli_from_args(cls, args): # Without --name the positional argument must be numeric, since it is converted with int()
+		return cls._cli_construct(args, user = args.user if args.isName else int(args.user))
--- a/snscrape/version.py
+++ b/snscrape/version.py
@@ -1,7 +1,7 @@
-import pkg_resources
+import importlib.metadata


 try:
-	__version__ = pkg_resources.get_distribution('snscrape').version
-except pkg_resources.DistributionNotFound:
+	__version__ = importlib.metadata.version('snscrape')
+except importlib.metadata.PackageNotFoundError:
 	__version__ = None