diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..e052d78 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,97 @@ +name: Bug report +description: Are you experiencing a problem? Create a report to help us improve! +labels: 'bug' +body: + - type: markdown + attributes: + value: | + ## Self Check + - Try searching existing GitHub Issues (open or closed) for similar issues. + - type: textarea + validations: + required: true + attributes: + label: Describe the bug + description: A clear description of what the bug is. + placeholder: e.g. I see an AssertionError when trying to scrape a Twitter user! + - type: textarea + validations: + required: true + attributes: + label: How to reproduce + description: | + How to reproduce the problem. + This should be a minimal reproducible example, i.e. the shortest possible code or the smallest number of steps that still causes the error. + placeholder: e.g. I can reproduce this issue by scraping the textfiles user with the twitter-user scraper. + - type: textarea + validations: + required: true + attributes: + label: Expected behaviour + description: A brief description of what should happen. + - type: textarea + attributes: + label: Screenshots and recordings + description: | + If applicable, add screenshots or videos to help explain your problem. (Videos should be as short as possible! Avoid watermarks too.) + - type: input + validations: + required: true + attributes: + label: Operating system + description: Include the version too, please! + placeholder: e.g. Windows 10, Ubuntu 20.04, macOS 10.15... 
+ - type: input + validations: + required: true + attributes: + label: | + Python version: output of `python3 --version` + - type: input + validations: + required: true + attributes: + label: | + snscrape version: output of `snscrape --version` + - type: input + validations: + required: true + attributes: + label: Scraper + placeholder: e.g. twitter-user, reddit-search, TwitterSearchScraper, ... + - type: dropdown + validations: + required: true + attributes: + label: How are you using snscrape? + options: ['CLI (`snscrape ...` as a command, e.g. in a terminal)', 'Module (`import snscrape.modules.something` in Python code)'] + - type: textarea + validations: + required: false + attributes: + label: Backtrace + description: What is the error snscrape gives you, if any? + - type: textarea + validations: + required: false + attributes: + label: Log output + description: | + Insert here the debug log of snscrape. + If you use the CLI, add the global options `-vv` to the command, e.g. `snscrape -vv twitter-search ...`. + If you use the module, set the debug level in your Python code before any use of snscrape: `import logging; logging.basicConfig(level = logging.DEBUG)`. + If you already use `logging` in your own code, you may need to adjust the level there instead. + - type: textarea + validations: + required: false + attributes: + label: Dump of locals + description: | + Here attach the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path). + Please note that it may contain identifying info such as IP address, if the website returns that. + You can also optionally request to exchange the file in private. + Finally, if snscrape didn't crash, leave this field blank. + - type: textarea + attributes: + label: Additional context + description: Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..5cf91e1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,27 @@ +name: Feature Request +description: Want a feature? Ask; we don't bite! +labels: 'enhancement' +body: + - type: markdown + attributes: + value: | + ## Self Check + - Try searching existing GitHub Issues (open or closed) for similar issues. + - type: textarea + validations: + required: true + attributes: + label: Describe the feature + description: A clear description of what the feature is. + - type: textarea + validations: + required: false + attributes: + label: Would this fix a problem you're experiencing? If so, specify. + - type: textarea + attributes: + label: Did you consider other alternatives? + description: If so, specify + - type: input + attributes: + label: Additional context diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..2632c39 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,6 @@ +--- +name: Question +about: Ask away! (Do not use this for bugs or features.) 
+labels: 'question' + +--- diff --git a/README.md b/README.md index bfe30d3..00cb4f9 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The following services are currently supported: * Mastodon: user profiles and toots (single or thread) * Reddit: users, subreddits, and searches (via Pushshift) * Telegram: channels -* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends +* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends * VKontakte: user profiles * Weibo (Sina Weibo): user profiles @@ -59,7 +59,10 @@ To get the latest 100 tweets with the hashtag #archiveteam: It is also possible to use snscrape as a library in Python, but this is currently undocumented. ## Issue reporting -If you discover an issue with snscrape, please report it at https://github.com/JustAnotherArchivist/snscrape/issues. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue. +If you discover an issue with snscrape, please report it at https://github.com/JustAnotherArchivist/snscrape/issues. If you use the CLI, please run snscrape with `-vv` and include the log output in the issue. If you use snscrape as a module, please enable debug-level logging using `import logging; logging.basicConfig(level = logging.DEBUG)` (before using snscrape at all) and include the log output in the issue. + +### Dump files +In some cases, debugging may require more information than is available in the log. The CLI has a `--dump-locals` option that enables dumping all local variables within snscrape based on important log messages (rather than, by default, only on crashes).
Note that the dump files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue. ## License This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ccf5bd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ['setuptools>=61', 'setuptools_scm>=6.2'] +build-backend = 'setuptools.build_meta' + +[tool.setuptools] +packages = ['snscrape', 'snscrape.modules'] + +[tool.setuptools_scm] + +[project] +name = 'snscrape' +description = 'A social networking service scraper' +readme = 'README.md' +authors = [{name = 'JustAnotherArchivist'}] +classifiers = [ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', +] +dependencies = [ + 'requests[socks]', + 'lxml', + 'beautifulsoup4', + 'pytz; python_version < "3.9.0"', + 'filelock', +] +requires-python = '~=3.8' +dynamic = ['version'] + +[project.urls] +repository = "https://github.com/JustAnotherArchivist/snscrape" + +[project.scripts] +snscrape = 'snscrape._cli:main' diff --git a/setup.py b/setup.py deleted file mode 100644 index c026667..0000000 --- a/setup.py +++ /dev/null @@ -1,42 +0,0 @@ -import os.path -import setuptools - - -with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp: - readme = fp.read() - - -setuptools.setup( - name = 'snscrape', - description = 'A social networking service scraper', - 
long_description = readme, - long_description_content_type = 'text/markdown', - author = 'JustAnotherArchivist', - url = 'https://github.com/JustAnotherArchivist/snscrape', - classifiers = [ - 'Development Status :: 4 - Beta', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - ], - packages = ['snscrape', 'snscrape.modules'], - setup_requires = ['setuptools_scm'], - use_scm_version = True, - install_requires = [ - 'requests[socks]', - 'lxml', - 'beautifulsoup4', - 'pytz; python_version < "3.9.0"', - 'filelock', - ], - python_requires = '~=3.8', - extras_require = { - 'test': ['coverage'], - }, - entry_points = { - 'console_scripts': [ - 'snscrape = snscrape._cli:main', - ], - }, -) diff --git a/snscrape/_cli.py b/snscrape/_cli.py index d4b66ea..c0bb32d 100644 --- a/snscrape/_cli.py +++ b/snscrape/_cli.py @@ -6,6 +6,7 @@ import datetime import importlib.metadata import inspect import logging +import os import requests # Imported in parse_args() after setting up the logger: #import snscrape.base @@ -23,7 +24,7 @@ logger = logging # Replaced below after setting the logger class class Logger(logging.Logger): def _log_with_stack(self, level, *args, **kwargs): super().log(level, *args, **kwargs) - if dumpLocals: + if dumpLocals and not kwargs.get('extra', {}).get('_snscrapeSuppressDumpLocals', False): stack = inspect.stack() if len(stack) >= 3: name = _dump_stack_and_locals(stack[2:][::-1]) @@ -118,7 +119,7 @@ def _dump_locals_on_exception(): trace = inspect.trace() if len(trace) >= 2: name = _dump_stack_and_locals(trace[1:], exc = e) - logger.fatal(f'Dumped stack and locals to {name}') + logger.fatal(f'Dumped stack and locals to {name}', extra = {'_snscrapeSuppressDumpLocals': True}) raise @@ -307,32 +308,36 @@ def main(): i = 0 with _dump_locals_on_exception(): - if args.withEntity and (entity := 
scraper.entity): - if args.jsonl: - print(entity.json()) + try: + if args.withEntity and (entity := scraper.entity): + if args.jsonl: + print(entity.json()) + else: + print(entity) + if args.maxResults == 0: + logger.info('Exiting after 0 results') + return + for i, item in enumerate(scraper.get_items(), start = 1): + if args.since is not None and item.date < args.since: + logger.info(f'Exiting due to reaching older results than {args.since}') + break + if args.jsonl: + print(item.json()) + elif args.format is not None: + print(args.format.format(item)) + else: + print(item) + if args.progress and i % 100 == 0: + print(f'Scraping, {i} results so far', file = sys.stderr) + if args.maxResults and i >= args.maxResults: + logger.info(f'Exiting after {i} results') + if args.progress: + print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) + break else: - print(entity) - if args.maxResults == 0: - logger.info('Exiting after 0 results') - return - for i, item in enumerate(scraper.get_items(), start = 1): - if args.since is not None and item.date < args.since: - logger.info(f'Exiting due to reaching older results than {args.since}') - break - if args.jsonl: - print(item.json()) - elif args.format is not None: - print(args.format.format(item)) - else: - print(item) - if args.progress and i % 100 == 0: - print(f'Scraping, {i} results so far', file = sys.stderr) - if args.maxResults and i >= args.maxResults: - logger.info(f'Exiting after {i} results') + logger.info(f'Done, found {i} results') if args.progress: - print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) - break - else: - logger.info(f'Done, found {i} results') - if args.progress: - print(f'Finished, {i} results', file = sys.stderr) + print(f'Finished, {i} results', file = sys.stderr) + except BrokenPipeError: + os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()) + sys.exit(1) diff --git a/snscrape/base.py b/snscrape/base.py index 
71ab649..c9e75d9 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -1,3 +1,6 @@ +__all__ = ['DeprecatedFeatureWarning', 'IntWithGranularity', 'Item', 'Scraper', 'ScraperException'] + + import abc import copy import dataclasses @@ -6,11 +9,28 @@ import functools import json import logging import requests +import requests.adapters +import urllib3.connection import time import warnings -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) + + +def _module_deprecation_helper(all, **names): + def __getattr__(name): + if name in names: + warnings.warn(f'{name} is deprecated, use {names[name].__name__} instead', DeprecatedFeatureWarning, stacklevel = 2) + return names[name] + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') + def __dir__(): + return sorted(all + list(names.keys())) + return __getattr__, __dir__ + + +class DeprecatedFeatureWarning(FutureWarning): + pass class _DeprecatedProperty: @@ -22,7 +42,7 @@ class _DeprecatedProperty: def __get__(self, obj, objType): if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class: return self - warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2) + warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedFeatureWarning, stacklevel = 2) return self.repl(obj) @@ -43,9 +63,9 @@ def _json_dataclass_to_dict(obj): if field.name.startswith('_'): continue out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name)) - # Add in (non-deprecated) properties + # Add properties for k in dir(obj): - if isinstance(getattr(type(obj), k, None), property): + if isinstance(getattr(type(obj), k, None), (property, _DeprecatedProperty)): assert k != '_type' if k.startswith('_'): continue @@ -68,7 +88,9 @@ class _JSONDataclass: def json(self): '''Convert the object to a JSON string''' - out = _json_dataclass_to_dict(self) + with warnings.catch_warnings(): + 
warnings.filterwarnings(action = 'ignore', category = DeprecatedFeatureWarning) + out = _json_dataclass_to_dict(self) for key, value in list(out.items()): # Modifying the dict below, so make a copy first if isinstance(value, IntWithGranularity): out[key] = int(value) @@ -79,7 +101,7 @@ class _JSONDataclass: @dataclasses.dataclass class Item(_JSONDataclass): - '''An abstract base class for an item returned by the scraper's get_items generator. + '''An abstract base class for an item returned by the scraper. An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item). ''' @@ -89,18 +111,6 @@ class Item(_JSONDataclass): pass -@dataclasses.dataclass -class Entity(_JSONDataclass): - '''An abstract base class for an entity returned by the scraper's entity property. - - An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network. - ''' - - @abc.abstractmethod - def __str__(self): - pass - - class IntWithGranularity(int): '''A number with an associated granularity @@ -116,18 +126,31 @@ class IntWithGranularity(int): return (IntWithGranularity, (int(self), self.granularity)) -class URLItem(Item): - '''A generic item which only holds a URL string.''' +class _HTTPSAdapter(requests.adapters.HTTPAdapter): + def init_poolmanager(self, *args, **kwargs): + super().init_poolmanager(*args, **kwargs) + #FIXME: Uses private urllib3.PoolManager attribute pool_classes_by_scheme. 
+ try: + self.poolmanager.pool_classes_by_scheme['https'].ConnectionCls = _HTTPSConnection + except (AttributeError, KeyError) as e: + _logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}') - def __init__(self, url): - self._url = url - @property - def url(self): - return self._url - - def __str__(self): - return self._url +class _HTTPSConnection(urllib3.connection.HTTPSConnection): + def connect(self, *args, **kwargs): + conn = super().connect(*args, **kwargs) + #FIXME: Uses undocumented attribute self.sock and beyond. + try: + _logger.debug(f'Connected to: {self.sock.getpeername()}') + except AttributeError: + # self.sock might be a urllib3.util.ssltransport.SSLTransport, which lacks getpeername. + pass + try: + _logger.debug(f'Connection cipher: {self.sock.cipher()}') + except AttributeError: + # Shouldn't be possible, but better safe than sorry. + pass + return conn class ScraperException(Exception): @@ -143,6 +166,7 @@ class Scraper: self._retries = retries self._proxies = proxies self._session = requests.Session() + self._session.mount('https://', _HTTPSAdapter()) @abc.abstractmethod def get_items(self): @@ -164,16 +188,17 @@ class Scraper: def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None): proxies = proxies or self._proxies or {} + errors = [] for attempt in range(self._retries + 1): # The request is newly prepared on each retry because of potential cookie updates. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) - logger.info(f'Retrieving {req.url}') - logger.debug(f'... with headers: {headers!r}') + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... with headers: {headers!r}') if data: - logger.debug(f'... 
with data: {data!r}') + _logger.debug(f'... with data: {data!r}') if environmentSettings: - logger.debug(f'... with environmentSettings: {environmentSettings!r}') + _logger.debug(f'... with environmentSettings: {environmentSettings!r}') try: r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings) except requests.exceptions.RequestException as exc: @@ -183,21 +208,25 @@ class Scraper: else: retrying = '' level = logging.ERROR - logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') + _logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') + errors.append(repr(exc)) else: redirected = f' (redirected to {r.url})' if r.history else '' - logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') + _logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') + _logger.debug(f'... with response headers: {r.headers!r}') if r.history: for i, redirect in enumerate(r.history): - logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})') + _logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})') + _logger.debug(f'... ... with response headers: {redirect.headers!r}') if responseOkCallback is not None: success, msg = responseOkCallback(r) + errors.append(msg) else: success, msg = (True, None) msg = f': {msg}' if msg else '' if success: - logger.debug(f'{req.url} retrieved successfully{msg}') + _logger.debug(f'{req.url} retrieved successfully{msg}') return r else: if attempt < self._retries: @@ -206,14 +235,15 @@ class Scraper: else: retrying = '' level = logging.ERROR - logger.log(level, f'Error retrieving {req.url}{msg}{retrying}') + _logger.log(level, f'Error retrieving {req.url}{msg}{retrying}') if attempt < self._retries: sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc. 
- logger.info(f'Waiting {sleepTime:.0f} seconds') + _logger.info(f'Waiting {sleepTime:.0f} seconds') time.sleep(sleepTime) else: msg = f'{self._retries + 1} requests to {req.url} failed, giving up.' - logger.fatal(msg) + _logger.fatal(msg) + _logger.fatal(f'Errors: {", ".join(errors)}') raise ScraperException(msg) raise RuntimeError('Reached unreachable code') @@ -229,7 +259,7 @@ class Scraper: @classmethod def _cli_from_args(cls, args): - return cls._construct(args) + return cls._cli_construct(args) @classmethod def _cli_construct(cls, argparseArgs, *args, **kwargs): @@ -244,3 +274,6 @@ def nonempty_string(name): raise ValueError('must not be an empty string') f.__name__ = name return f + + +__getattr__, __dir__ = _module_deprecation_helper(__all__, Entity = Item) diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 6b6bbde..c2839f1 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -30,7 +30,7 @@ class FacebookPost(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): username: str pageId: int name: str diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py index 14483e0..1cd5db7 100644 --- a/snscrape/modules/instagram.py +++ b/snscrape/modules/instagram.py @@ -32,7 +32,7 @@ class InstagramPost(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): username: str name: typing.Optional[str] followers: snscrape.base.IntWithGranularity diff --git a/snscrape/modules/mastodon.py b/snscrape/modules/mastodon.py index cfe69e4..653e83a 100644 --- a/snscrape/modules/mastodon.py +++ b/snscrape/modules/mastodon.py @@ -67,7 +67,7 @@ class PollOption: @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): account: str # @username@domain.invalid displayName: typing.Optional[str] = None displayNameWithCustomEmojis: 
typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py index 55af939..f93b96e 100644 --- a/snscrape/modules/reddit.py +++ b/snscrape/modules/reddit.py @@ -133,6 +133,21 @@ class _RedditPushshiftScraper(snscrape.base.Scraper): return cls(**kwargs) + def _iter_api(self, url, params = None): + '''Iterate through the Pushshift API using the 'until' parameter and yield the items.''' + lowestIdSeen = None + if params is None: + params = {} + while True: + obj = self._get_api(url, params = params) + if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination + break + for d in obj['data']: + if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1: + yield self._api_obj_to_item(d) + lowestIdSeen = d['id'] + params['until'] = obj["data"][-1]["created_utc"] + 1 + class _RedditPushshiftSearchScraper(_RedditPushshiftScraper): def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs): @@ -148,35 +163,20 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper): if not self._submissions and not self._comments: raise ValueError('At least one of submissions and comments must be True') - def _iter_api(self, url, params = None): - '''Iterate through the Pushshift API using the 'before' parameter and yield the items.''' - lowestIdSeen = None - if params is None: - params = {} - if self._before is not None: - params['before'] = self._before - if self._after is not None: - params['after'] = self._after - params['sort'] = 'desc' - while True: - obj = self._get_api(url, params = params) - if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination - break - for d in obj['data']: - if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1: - yield self._api_obj_to_item(d) - lowestIdSeen = 
d['id'] - params['before'] = obj["data"][-1]["created_utc"] + 1 - def _iter_api_submissions_and_comments(self, params: dict): # Retrieve both submissions and comments, interleave the results to get a reverse-chronological order - params['size'] = '1000' + params['limit'] = '1000' + if self._before is not None: + params['until'] = self._before + if self._after is not None: + params['since'] = self._after + if self._submissions: - submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict + submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict else: submissionsIter = iter(()) if self._comments: - commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy()) + commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment', params.copy()) else: commentsIter = iter(()) @@ -260,21 +260,15 @@ class RedditSubmissionScraper(_RedditPushshiftScraper): self._submissionId = submissionId def get_items(self): - obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}') + obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission?ids={self._submissionId}') if not obj['data']: return if len(obj['data']) != 1: raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1') yield self._api_obj_to_item(obj['data'][0]) - obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}') - if not obj['data']: - return - commentIds = obj['data'] - for i in range(0, len(commentIds), 500): - ids = commentIds[i : i + 500] - obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}') - yield from map(self._api_obj_to_item, obj['data']) 
+ # Upstream bug: link_id must be provided in decimal https://old.reddit.com/r/pushshift/comments/zkggt0/update_on_colo_switchover_bug_fixes_reindexing/ + yield from self._iter_api('https://api.pushshift.io/reddit/search/comment', {'link_id': int(self._submissionId, 36), 'limit': 1000}) @classmethod def _cli_setup_parser(cls, subparser): diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 64b8ca4..4e97765 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -24,7 +24,7 @@ class LinkPreview: @dataclasses.dataclass -class Channel(snscrape.base.Entity): +class Channel(snscrape.base.Item): username: str title: typing.Optional[str] = None verified: typing.Optional[bool] = None @@ -269,14 +269,10 @@ class TelegramChannelScraper(snscrape.base.Scraper): if r.status_code != 200: raise snscrape.base.ScraperException(f'Got status code {r.status_code}') soup = bs4.BeautifulSoup(r.text, 'lxml') - membersDiv = soup.find('div', class_ = 'tgme_page_extra') - if membersDiv is not None: + if (membersDiv := soup.find('div', class_ = 'tgme_page_extra')): if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')): membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1]) - if membersStr == 'no': - kwargs['members'] = 0 - else: - kwargs['members'] = int(membersStr) + kwargs['members'] = 0 if membersStr == 'no' else int(membersStr) photoImg = soup.find('img', class_ = 'tgme_page_photo_image') if photoImg is not None: kwargs['photo'] = photoImg.attrs['src'] diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 048df34..8c66fe4 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,5 +1,5 @@ __all__ = [ - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionUrl', 'Coordinates', 'Place', + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'TextLink', 'Coordinates', 'Place', 'User', 'UserLabel', 'Trend', 'GuestTokenManager', @@ -7,6 +7,7 
@@ __all__ = [ 'TwitterUserScraper', 'TwitterProfileScraper', 'TwitterHashtagScraper', + 'TwitterCashtagScraper', 'TwitterTweetScraperMode', 'TwitterTweetScraper', 'TwitterListPostsScraper', @@ -14,7 +15,9 @@ __all__ = [ ] +import base64 import collections +import copy import dataclasses import datetime import email.utils @@ -26,24 +29,29 @@ import random import logging import os import re +import requests.adapters import snscrape.base +import snscrape.utils import string import time import typing import urllib.parse +import urllib3.util.ssl_ +import warnings _logger = logging.getLogger(__name__) _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _globalGuestTokenManager = None _GUEST_TOKEN_VALIDITY = 10800 +_CIPHERS_CHROME = 'TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA:AES256-SHA' @dataclasses.dataclass class Tweet(snscrape.base.Item): url: str date: datetime.datetime - content: str + rawContent: str renderedContent: str id: int user: 'User' @@ -53,11 +61,10 @@ class Tweet(snscrape.base.Item): quoteCount: int conversationId: int lang: str - source: str + source: typing.Optional[str] = None sourceUrl: typing.Optional[str] = None sourceLabel: typing.Optional[str] = None - outlinks: typing.Optional[typing.List[str]] = None - tcooutlinks: typing.Optional[typing.List[str]] = None + links: typing.Optional[typing.List['TextLink']] = None media: typing.Optional[typing.List['Medium']] = None retweetedTweet: typing.Optional['Tweet'] = None quotedTweet: typing.Optional['Tweet'] = None @@ -69,15 +76,28 @@ class Tweet(snscrape.base.Item): hashtags: typing.Optional[typing.List[str]] = None cashtags: 
typing.Optional[typing.List[str]] = None card: typing.Optional['Card'] = None + viewCount: typing.Optional[int] = None + vibe: typing.Optional['Vibe'] = None username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username') - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks') - tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(self.tcooutlinks) if self.tcooutlinks else '', 'tcooutlinks') + outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)') + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)') + tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)') + tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)') + content = snscrape.base._DeprecatedProperty('content', lambda self: self.rawContent, 'rawContent') def __str__(self): return self.url +@dataclasses.dataclass +class TextLink: + text: typing.Optional[str] + url: str + tcourl: typing.Optional[str] + indices: typing.Tuple[int, int] + + class Medium: pass @@ -86,12 +106,13 @@ class Medium: class Photo(Medium): previewUrl: str fullUrl: str + altText: typing.Optional[str] = None @dataclasses.dataclass class VideoVariant: - contentType: str url: str + contentType: typing.Optional[str] bitrate: typing.Optional[int] @@ -101,20 +122,14 @@ class Video(Medium): variants: typing.List[VideoVariant] duration: typing.Optional[float] = None views: typing.Optional[int] = None + altText: typing.Optional[str] = None @dataclasses.dataclass class Gif(Medium): 
thumbnailUrl: str variants: typing.List[VideoVariant] - - -@dataclasses.dataclass -class DescriptionURL: - text: typing.Optional[str] - url: str - tcourl: str - indices: typing.Tuple[int, int] + altText: typing.Optional[str] = None @dataclasses.dataclass @@ -125,6 +140,7 @@ class Coordinates: @dataclasses.dataclass class Place: + id: str fullName: str name: str type: str @@ -196,7 +212,7 @@ class PromoConvoAction: class BroadcastCard(Card): id: str url: str - title: str + title: typing.Optional[str] = None state: typing.Optional[str] = None broadcaster: typing.Optional['User'] = None thumbnailUrl: typing.Optional[str] = None @@ -212,7 +228,7 @@ class PeriscopeBroadcastCard(Card): description: str state: str totalParticipants: int - thumbnailUrl: str + thumbnailUrl: typing.Optional[str] = None source: typing.Optional[str] = None broadcaster: typing.Optional['User'] = None siteUser: typing.Optional['User'] = None @@ -240,10 +256,10 @@ class Event: class NewsletterCard(Card): title: str description: str - imageUrl: str url: str revueAccountId: int issueCount: int + imageUrl: typing.Optional[str] = None @dataclasses.dataclass @@ -300,33 +316,12 @@ class UnifiedCard(Card): apps: typing.Optional[typing.Dict[UnifiedCardAppKey, typing.List['UnifiedCardApp']]] = None components: typing.Optional[typing.List[UnifiedCardComponentKey]] = None swipeableLayoutSlides: typing.Optional[typing.List['UnifiedCardSwipeableLayoutSlide']] = None + collectionLayoutSlides: typing.Optional[typing.List['UnifiedCardCollectionLayoutSlide']] = None type: typing.Optional[str] = None def __post_init__(self): - if (self.components is None) == (self.swipeableLayoutSlides is None): - raise ValueError('did not get exactly one of components or swipeableLayoutSlides') - if self.components and not all(k in self.componentObjects for k in self.components): - raise ValueError('missing components') - if self.swipeableLayoutSlides and not all(s.mediumComponentKey in self.componentObjects and s.componentKey in 
self.componentObjects for s in self.swipeableLayoutSlides): - raise ValueError('missing components') - if any(c.destinationKey not in self.destinations for c in self.componentObjects.values() if hasattr(c, 'destinationKey')): - raise ValueError('missing destinations') - if any(b.destinationKey not in self.destinations for c in self.componentObjects.values() if isinstance(c, UnifiedCardButtonGroupComponentObject) for b in c.buttons): - raise ValueError('missing destinations') - mediaKeys = [] - for c in self.componentObjects.values(): - if isinstance(c, UnifiedCardMediumComponentObject): - mediaKeys.append(c.mediumKey) - elif isinstance(c, UnifiedCardSwipeableMediaComponentObject): - mediaKeys.extend(x.mediumKey for x in c.media) - mediaKeys.extend(d.mediumKey for d in self.destinations.values() if d.mediumKey is not None) - mediaKeys.extend(a.iconMediumKey for l in (self.apps.values() if self.apps is not None else []) for a in l if a.iconMediumKey is not None) - if any(k not in self.media for k in mediaKeys): - raise ValueError('missing media') - if any(c.appKey not in self.apps for c in self.componentObjects.values() if hasattr(c, 'appKey')): - raise ValueError('missing apps') - if any(d.appKey not in self.apps for d in self.destinations.values() if d.appKey is not None): - raise ValueError('missing apps') + if (self.components is not None) + (self.swipeableLayoutSlides is not None) + (self.collectionLayoutSlides is not None) != 1: + raise ValueError('did not get exactly one of components, swipeableLayoutSlides, and collectionLayoutSlides') class UnifiedCardComponentObject: @@ -407,9 +402,9 @@ class UnifiedCardApp: type: str id: str title: str - category: str countryCode: str url: str + category: typing.Optional[str] = None description: typing.Optional[str] = None iconMediumKey: typing.Optional[UnifiedCardMediumKey] = None size: typing.Optional[int] = None @@ -428,6 +423,19 @@ class UnifiedCardSwipeableLayoutSlide: componentKey: UnifiedCardComponentKey 
+@dataclasses.dataclass +class UnifiedCardCollectionLayoutSlide: + detailsComponentKey: UnifiedCardComponentKey + mediumComponentKey: UnifiedCardComponentKey + + +@dataclasses.dataclass +class Vibe: + text: str + imageUrl: str + imageDescription: str + + @dataclasses.dataclass class TweetRef(snscrape.base.Item): '''A reference to a tweet for which no proper Tweet object could be produced from the data returned by Twitter''' @@ -439,15 +447,27 @@ class TweetRef(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class Tombstone(snscrape.base.Item): + '''A placeholder for a tweet that cannot be accessed''' + + id: int + text: typing.Optional[str] = None + textLinks: typing.Optional[typing.List[TextLink]] = None + + def __str__(self): + return f'https://twitter.com/i/web/status/{self.id}' + + +@dataclasses.dataclass +class User(snscrape.base.Item): # Most fields can be None if they're not known. username: str id: int displayname: typing.Optional[str] = None - description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact - descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None + renderedDescription: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced + descriptionLinks: typing.Optional[typing.List[TextLink]] = None verified: typing.Optional[bool] = None created: typing.Optional[datetime.datetime] = None followersCount: typing.Optional[int] = None @@ -458,12 +478,16 @@ class User(snscrape.base.Entity): mediaCount: typing.Optional[int] = None location: typing.Optional[str] = None protected: typing.Optional[bool] = None - linkUrl: typing.Optional[str] = None - linkTcourl: typing.Optional[str] = None + link: typing.Optional[TextLink] = None profileImageUrl: typing.Optional[str] = None profileBannerUrl: typing.Optional[str] = None label: 
typing.Optional['UserLabel'] = None + descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks') + linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url') + linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl') + description = snscrape.base._DeprecatedProperty('description', lambda self: self.renderedDescription, 'renderedDescription') + @property def url(self): return f'https://twitter.com/{self.username}' @@ -483,6 +507,27 @@ class UserLabel: @dataclasses.dataclass class UserRef: id: int + text: typing.Optional[str] = None + textLinks: typing.Optional[typing.List[TextLink]] = None + + def __str__(self): + return f'https://twitter.com/i/user/{self.id}' + + +@dataclasses.dataclass +class Community(snscrape.base.Item): + id: int + name: str + created: datetime.datetime + admin: typing.Union[User, UserRef] + creator: typing.Union[User, UserRef] + membersFacepile: typing.List[typing.Union[User, UserRef]] + moderatorsCount: int + membersCount: int + rules: typing.List[str] + theme: str + bannerUrl: str + description: typing.Optional[str] = None @dataclasses.dataclass @@ -548,7 +593,12 @@ class _CLIGuestTokenManager(GuestTokenManager): return None _logger.info(f'Reading guest token from {self._file}') with open(self._file, 'r') as fp: - o = json.load(fp) + try: + o = json.load(fp) + except json.JSONDecodeError as e: + _logger.warning(f'Malformed guest token file {self._file}: {e!s}') + self.reset() + return None self._token = o['token'] self._setTime = o['setTime'] if self._setTime < time.time() - _GUEST_TOKEN_VALIDITY: @@ -588,13 +638,20 @@ class _CLIGuestTokenManager(GuestTokenManager): pass +class _TwitterTLSAdapter(snscrape.base._HTTPSAdapter): + def init_poolmanager(self, *args, **kwargs): + #FIXME: When urllib3 2.0.0 is out and can be required, this should use 
urllib3.util.create_urllib3_context instead of the private, undocumented ssl_ module. + kwargs['ssl_context'] = urllib3.util.ssl_.create_urllib3_context(ciphers = _CIPHERS_CHROME) + super().init_poolmanager(*args, **kwargs) + + class _TwitterAPIType(enum.Enum): V2 = 0 # Introduced with the redesign GRAPHQL = 1 class _TwitterAPIScraper(snscrape.base.Scraper): - def __init__(self, baseUrl, *, guestTokenManager = None, **kwargs): + def __init__(self, baseUrl, *, guestTokenManager = None, maxEmptyPages = 0, **kwargs): super().__init__(**kwargs) self._baseUrl = baseUrl if guestTokenManager is None: @@ -603,22 +660,26 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _globalGuestTokenManager = GuestTokenManager() guestTokenManager = _globalGuestTokenManager self._guestTokenManager = guestTokenManager + self._maxEmptyPages = maxEmptyPages self._apiHeaders = { 'User-Agent': None, 'Authorization': _API_AUTHORIZATION_HEADER, 'Referer': self._baseUrl, 'Accept-Language': 'en-US,en;q=0.5', } + adapter = _TwitterTLSAdapter() + self._session.mount('https://twitter.com', adapter) + self._session.mount('https://api.twitter.com', adapter) self._set_random_user_agent() def _set_random_user_agent(self): - self._userAgent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.{random.randint(0, 9999)} Safari/537.{random.randint(0, 99)}' + self._userAgent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.5563.{random.randint(0, 9999)} Safari/537.{random.randint(0, 99)}' self._apiHeaders['User-Agent'] = self._userAgent def _check_guest_token_response(self, r): if r.status_code != 200: self._set_random_user_agent() - return False, f'non-200 response ({r.status_code})' + return False, ('non-200 response' if r.status_code != 404 else 'blocked') + f' ({r.status_code})' return True, None def _ensure_guest_token(self, url = None): @@ -650,20 +711,20 @@ class 
_TwitterAPIScraper(snscrape.base.Scraper): del self._apiHeaders['x-guest-token'] def _check_api_response(self, r): - if r.status_code in (403, 429): + if r.status_code in (403, 404, 429): self._unset_guest_token() self._ensure_guest_token() return False, f'blocked ({r.status_code})' if r.headers.get('content-type', '').replace(' ', '') != 'application/json;charset=utf-8': return False, 'content type is not JSON' if r.status_code != 200: - return False, 'non-200 status code' + return False, f'non-200 status code ({r.status_code})' return True, None def _get_api_data(self, endpoint, apiType, params): self._ensure_guest_token() if apiType is _TwitterAPIType.GRAPHQL: - params = urllib.parse.urlencode({'variables': json.dumps(params, separators = (',', ':'))}, quote_via = urllib.parse.quote) + params = urllib.parse.urlencode({k: json.dumps(v, separators = (',', ':')) for k, v in params.items()}, quote_via = urllib.parse.quote) r = self._get(endpoint, params = params, headers = self._apiHeaders, responseOkCallback = self._check_api_response) try: obj = r.json() @@ -683,8 +744,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if cursor is None: reqParams = params else: - reqParams = paginationParams.copy() - reqParams['cursor'] = cursor + reqParams = copy.deepcopy(paginationParams) + if apiType is _TwitterAPIType.V2: + reqParams['cursor'] = cursor + else: + reqParams['variables']['cursor'] = cursor bottomCursorAndStop = None if direction is _ScrollDirection.TOP or direction is _ScrollDirection.BOTH: dir = 'top' @@ -692,6 +756,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): dir = 'bottom' stopOnEmptyResponse = False emptyResponsesOnCursor = 0 + emptyPages = 0 while True: _logger.info(f'Retrieving scroll page {cursor}') obj = self._get_api_data(endpoint, apiType, reqParams) @@ -706,11 +771,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): elif apiType is _TwitterAPIType.GRAPHQL: if 'user' in obj['data']: # UserTweets, UserTweetsAndReplies - instructions = 
obj['data']['user']['result']['timeline']['timeline']['instructions'] + instructions = obj['data']['user']['result']['timeline_v2']['timeline']['instructions'] else: # TweetDetail - instructions = obj['data'].get('threaded_conversation_with_injections', {}).get('instructions', []) - tweetCount = 0 + instructions = obj['data'].get('threaded_conversation_with_injections_v2', {}).get('instructions', []) + entryCount = 0 for instruction in instructions: if 'addEntries' in instruction: entries = instruction['addEntries']['entries'] @@ -720,7 +785,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): entries = instruction['entries'] else: continue - tweetCount += self._count_tweets(entries) + entryCount += self._count_tweets_and_users(entries) for entry in entries: if not (entry['entryId'].startswith('sq-cursor-') or entry['entryId'].startswith('cursor-')): continue @@ -736,19 +801,27 @@ class _TwitterAPIScraper(snscrape.base.Scraper): newCursor = entryCursor if entryCursorStop is not None: stopOnEmptyResponse = entryCursorStop - elif entry['entryId'].startswith('cursor-showMoreThreadsPrompt-'): # E.g. 'offensive' replies button + elif entry['entryId'].startswith('cursor-showmorethreadsprompt-') or entry['entryId'].startswith('cursor-showmorethreads-'): + # E.g. 'offensive' replies and 'Show more replies' button promptCursor = entryCursor elif direction is _ScrollDirection.BOTH and bottomCursorAndStop is None and (entry['entryId'] == 'sq-cursor-bottom' or entry['entryId'].startswith('cursor-bottom-')): newBottomCursorAndStop = (entryCursor, entryCursorStop or False) if bottomCursorAndStop is None and newBottomCursorAndStop is not None: bottomCursorAndStop = newBottomCursorAndStop - if newCursor == cursor and tweetCount == 0: + if newCursor == cursor and entryCount == 0: # Twitter sometimes returns the same cursor as requested and no results even though there are more results. # When this happens, retry the same cursor up to the retries setting. 
emptyResponsesOnCursor += 1 if emptyResponsesOnCursor > self._retries: break - if not newCursor or (stopOnEmptyResponse and tweetCount == 0): + if entryCount == 0: + emptyPages += 1 + if self._maxEmptyPages and emptyPages >= self._maxEmptyPages: + _logger.warning(f'Stopping after {emptyPages} empty pages') + break + else: + emptyPages = 0 + if not newCursor or (stopOnEmptyResponse and entryCount == 0): # End of pagination if promptCursor is not None: newCursor = promptCursor @@ -761,13 +834,16 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if newCursor != cursor: emptyResponsesOnCursor = 0 cursor = newCursor - reqParams = paginationParams.copy() - reqParams['cursor'] = cursor + reqParams = copy.deepcopy(paginationParams) + if apiType is _TwitterAPIType.V2: + reqParams['cursor'] = cursor + else: + reqParams['variables']['cursor'] = cursor - def _count_tweets(self, entries): - return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') for entry in entries) + def _count_tweets_and_users(self, entries): + return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') or entry['entryId'].startswith('user-') for entry in entries) - def _v2_timeline_instructions_to_tweets(self, obj, includeConversationThreads = False): + def _v2_timeline_instructions_to_tweets_or_users(self, obj): # No data format test, just a hard and loud crash if anything's wrong :-) for instruction in obj['timeline']['instructions']: if 'addEntries' in instruction: @@ -779,10 +855,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): for entry in entries: if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'): yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj) - elif includeConversationThreads and entry['entryId'].startswith('conversationThread-') and not entry['entryId'].endswith('-show_more_cursor'): - for item in entry['content']['timelineModule']['items']: - if 
item['entryId'].startswith('tweet-'): - yield from self._v2_instruction_tweet_entry_to_tweet(item['entryId'], item, obj) + elif entry['entryId'].startswith('user-'): + yield self._user_to_user(obj['globalObjects']['users'][entry['content']['item']['content']['user']['id']]) def _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj): if 'tweet' in entry['item']['content']: @@ -792,13 +866,6 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _logger.warning(f'Skipping tweet {entry["item"]["content"]["tweet"]["id"]} which is not in globalObjects') return tweet = obj['globalObjects']['tweets'][entry['item']['content']['tweet']['id']] - elif 'tombstone' in entry['item']['content']: - if 'tweet' not in entry['item']['content']['tombstone']: # E.g. deleted reply - return - if entry['item']['content']['tombstone']['tweet']['id'] not in obj['globalObjects']['tweets']: - _logger.warning(f'Skipping tweet {entry["item"]["content"]["tombstone"]["tweet"]["id"]} which is not in globalObjects') - return - tweet = obj['globalObjects']['tweets'][entry['item']['content']['tombstone']['tweet']['id']] else: raise snscrape.base.ScraperException(f'Unable to handle entry {entryId!r}') yield self._tweet_to_tweet(tweet, obj) @@ -806,17 +873,20 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _get_tweet_id(self, tweet): return tweet['id'] if 'id' in tweet else int(tweet['id_str']) - def _make_tweet(self, tweet, user, retweetedTweet = None, quotedTweet = None, card = None): + def _make_tweet(self, tweet, user, retweetedTweet = None, quotedTweet = None, card = None, **kwargs): tweetId = self._get_tweet_id(tweet) - kwargs = {} kwargs['id'] = tweetId - kwargs['content'] = tweet['full_text'] + kwargs['rawContent'] = tweet['full_text'] kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls')) kwargs['user'] = user kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at']) if tweet['entities'].get('urls'): - 
kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']] - kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']] + kwargs['links'] = [TextLink( + text = u.get('display_url'), + url = u['expanded_url'], + tcourl = u['url'], + indices = tuple(u['indices']), + ) for u in tweet['entities']['urls']] kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}' kwargs['replyCount'] = tweet['reply_count'] kwargs['retweetCount'] = tweet['retweet_count'] @@ -824,11 +894,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['quoteCount'] = tweet['quote_count'] kwargs['conversationId'] = tweet['conversation_id'] if 'conversation_id' in tweet else int(tweet['conversation_id_str']) kwargs['lang'] = tweet['lang'] - kwargs['source'] = tweet['source'] - if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): - kwargs['sourceUrl'] = match.group(1) - if (match := re.search(r'>([^<]*)<', tweet['source'])): - kwargs['sourceLabel'] = match.group(1) + if 'source' in tweet: + kwargs['source'] = tweet['source'] + if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): + kwargs['sourceUrl'] = match.group(1) + if (match := re.search(r'>([^<]*)<', tweet['source'])): + kwargs['sourceLabel'] = match.group(1) if 'extended_entities' in tweet and 'media' in tweet['extended_entities']: media = [] for medium in tweet['extended_entities']['media']: @@ -864,7 +935,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if (coords := tweet['geo']['coordinates']) and len(coords) == 2: kwargs['coordinates'] = Coordinates(coords[1], coords[0]) if tweet.get('place'): - kwargs['place'] = Place(tweet['place']['full_name'], tweet['place']['name'], tweet['place']['place_type'], tweet['place']['country'], tweet['place']['country_code']) + kwargs['place'] = Place(tweet['place']['id'], tweet['place']['full_name'], tweet['place']['name'], tweet['place']['place_type'], tweet['place']['country'], tweet['place']['country_code']) if 
'coordinates' not in kwargs and tweet['place'].get('bounding_box') and (coords := tweet['place']['bounding_box']['coordinates']) and coords[0] and len(coords[0][0]) == 2: # Take the first (longitude, latitude) couple of the "place square" kwargs['coordinates'] = Coordinates(coords[0][0][0], coords[0][0][1]) @@ -877,10 +948,15 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if hasattr(card, 'url') and '//t.co/' in card.url: # Try to convert the URL to the non-shortened/t.co one # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case. - if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']: - card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)] - elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks: - card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)] + candidates = [] + if 'links' in kwargs: + candidates.extend(kwargs['links']) + if retweetedTweet: + candidates.extend(retweetedTweet.links) + for u in candidates: + if u.tcourl == card.url: + card.url = u.url + break else: _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}') return Tweet(**kwargs) @@ -896,10 +972,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if format not in ('jpg', 'png'): _logger.warning(f'Skipping photo with unknown format on tweet {tweetId}: {format!r}') return - return Photo( - previewUrl = f'{baseUrl}?format={format}&name=small', - fullUrl = f'{baseUrl}?format={format}&name=large', - ) + mKwargs = { + 'previewUrl': f'{baseUrl}?format={format}&name=small', + 'fullUrl': f'{baseUrl}?format={format}&name=orig', + } + if medium.get('ext_alt_text'): + mKwargs['altText'] = medium['ext_alt_text'] + return Photo(**mKwargs) elif medium['type'] == 'video' or medium['type'] == 'animated_gif': variants = [] for variant in medium['video_info']['variants']: @@ -917,6 +996,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): cls 
= Video elif medium['type'] == 'animated_gif': cls = Gif + if medium.get('ext_alt_text'): + mKwargs['altText'] = medium['ext_alt_text'] return cls(**mKwargs) else: _logger.warning(f'Unsupported medium type on tweet {tweetId}: {medium["type"]!r}') @@ -924,10 +1005,6 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _make_card(self, card, apiType, tweetId): bindingValues = {} - def _kwargs_from_map(keyKwargMap): - nonlocal bindingValues - return {kwarg: bindingValues[key] for key, kwarg in keyKwargMap.items() if key in bindingValues} - userRefs = {} if apiType is _TwitterAPIType.V2: for o in card.get('users', {}).values(): @@ -935,15 +1012,24 @@ class _TwitterAPIScraper(snscrape.base.Scraper): assert userId not in userRefs userRefs[userId] = self._user_to_user(o) elif apiType is _TwitterAPIType.GRAPHQL: - for o in card['legacy'].get('user_refs', {}): - userId = int(o['rest_id']) - if userId in userRefs: - _logger.warning(f'Duplicate user {userId} in card on tweet {tweetId}') + for o in card['legacy'].get('user_refs_results', []): + if 'result' not in o: + _logger.warning(f'Empty user ref object in card on tweet {tweetId}') continue + o = o['result'] + if o['__typename'] == 'UserUnavailable': + _logger.warning(f'Unavailable user in card on tweet {tweetId}') + continue + userId = int(o['rest_id']) if 'legacy' in o: - userRefs[userId] = self._user_to_user(o['legacy'], id_ = userId) + user = self._user_to_user(o['legacy'], id_ = userId) else: - userRefs[userId] = UserRef(id = userId) + user = UserRef(id = userId) + if userId in userRefs: + if userRefs[userId] != user: + _logger.warning(f'Duplicate user {userId} with differing data in card on tweet {tweetId}') + continue + userRefs[userId] = user if apiType is _TwitterAPIType.V2: messyBindingValues = card['binding_values'].items() @@ -967,7 +1053,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): elif value['type'] == 'BOOLEAN': bindingValues[key] = value['boolean_value'] elif value['type'] == 'USER': - 
bindingValues[key] = userRefs[int(value['user_value']['id_str'])] + userId = int(value['user_value']['id_str']) + bindingValues[key] = userRefs.get(userId) + if bindingValues[key] is None: + _logger.warning(f'User {userId} not found in user refs in card on tweet {tweetId}') else: _logger.warning(f'Unsupported card value type on {key!r} on tweet {tweetId}: {value["type"]!r}') @@ -977,7 +1066,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): cardName = card['legacy']['name'] if cardName in ('summary', 'summary_large_image', 'app', 'direct_store_link_app'): - keyKwargMap = { + keyMap = { 'title': 'title', 'description': 'description', 'card_url': 'url', @@ -985,13 +1074,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): 'creator': 'creatorUser', } if cardName in ('app', 'direct_store_link_app'): - keyKwargMap['thumbnail_original'] = 'thumbnailUrl' - return AppCard(**_kwargs_from_map(keyKwargMap)) + keyMap['thumbnail_original'] = 'thumbnailUrl' + return AppCard(**snscrape.utils.dict_map(bindingValues, keyMap)) else: - keyKwargMap['thumbnail_image_original'] = 'thumbnailUrl' - return SummaryCard(**_kwargs_from_map(keyKwargMap)) + keyMap['thumbnail_image_original'] = 'thumbnailUrl' + return SummaryCard(**snscrape.utils.dict_map(bindingValues, keyMap)) elif any(cardName.startswith(x) for x in ('poll2choice_', 'poll3choice_', 'poll4choice_')) and cardName.split('_', 1)[1] in ('text_only', 'image', 'video'): - kwargs = _kwargs_from_map({'end_datetime_utc': 'endDate', 'last_updated_datetime_utc': 'lastUpdateDate', 'duration_minutes': 'duration', 'counts_are_final': 'finalResults'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'end_datetime_utc': 'endDate', 'last_updated_datetime_utc': 'lastUpdateDate', 'duration_minutes': 'duration', 'counts_are_final': 'finalResults'}) options = [] for key in sorted(bindingValues): @@ -1015,9 +1104,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): return PollCard(**kwargs) elif cardName == 'player': - return 
PlayerCard(**_kwargs_from_map({'title': 'title', 'description': 'description', 'card_url': 'url', 'player_image_original': 'imageUrl', 'site': 'siteUser'})) + return PlayerCard(**snscrape.utils.dict_map(bindingValues, {'title': 'title', 'description': 'description', 'card_url': 'url', 'player_image_original': 'imageUrl', 'site': 'siteUser'})) elif cardName in ('promo_image_convo', 'promo_video_convo'): - kwargs = _kwargs_from_map({'thank_you_text': 'thankYouText', 'thank_you_url': 'thankYouUrl', 'thank_you_shortened_url': 'thankYouTcoUrl'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'thank_you_text': 'thankYouText', 'thank_you_url': 'thankYouUrl', 'thank_you_shortened_url': 'thankYouTcoUrl'}) kwargs['actions'] = [] for l in ('one', 'two', 'three', 'four'): if f'cta_{l}' in bindingValues: @@ -1036,14 +1125,17 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['medium'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds'])) return PromoConvoCard(**kwargs) elif cardName in ('745291183405076480:broadcast', '3691233323:periscope_broadcast'): - keyKwargMap = {'broadcast_state': 'state', 'broadcast_source': 'source', 'site': 'siteUser'} + keyMap = {'broadcast_state': 'state', 'broadcast_source': 'source', 'site': 'siteUser'} if cardName == '745291183405076480:broadcast': - keyKwargMap = {**keyKwargMap, 'broadcast_id': 'id', 'broadcast_url': 'url', 'broadcast_title': 'title', 'broadcast_thumbnail_original': 'thumbnailUrl'} + keyMap = {**keyMap, 'broadcast_id': 'id', 'broadcast_url': 'url', 'broadcast_title': 'title', 'broadcast_thumbnail_original': 'thumbnailUrl'} else: - keyKwargMap = {**keyKwargMap, 'id': 'id', 'url': 'url', 'title': 'title', 'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'} - kwargs = _kwargs_from_map(keyKwargMap) + keyMap = {**keyMap, 'id': 'id', 'url': 'url', 'title': 'title', 
'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'} + kwargs = snscrape.utils.dict_map(bindingValues, keyMap) if 'broadcaster_twitter_id' in bindingValues: - kwargs['broadcaster'] = User(id = int(bindingValues['broadcaster_twitter_id']), username = bindingValues['broadcaster_username'], displayname = bindingValues['broadcaster_display_name']) + if int(bindingValues['broadcaster_twitter_id']) in userRefs: + kwargs['broadcaster'] = userRefs[int(bindingValues['broadcaster_twitter_id'])] + else: + kwargs['broadcaster'] = User(id = int(bindingValues['broadcaster_twitter_id']), username = bindingValues['broadcaster_username'], displayname = bindingValues['broadcaster_display_name']) if 'siteUser' not in kwargs: kwargs['siteUser'] = None if cardName == '745291183405076480:broadcast': @@ -1052,17 +1144,17 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['totalParticipants'] = int(kwargs['totalParticipants']) return PeriscopeBroadcastCard(**kwargs) elif cardName == '745291183405076480:live_event': - kwargs = _kwargs_from_map({'event_id': 'id', 'event_title': 'title', 'event_category': 'category', 'event_subtitle': 'description'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'event_id': 'id', 'event_title': 'title', 'event_category': 'category', 'event_subtitle': 'description'}) kwargs['id'] = int(kwargs['id']) kwargs['photo'] = Photo(previewUrl = bindingValues['event_thumbnail_small'], fullUrl = bindingValues.get('event_thumbnail_original') or bindingValues['event_thumbnail']) return EventCard(event = Event(**kwargs)) elif cardName == '3337203208:newsletter_publication': - kwargs = _kwargs_from_map({'newsletter_title': 'title', 'newsletter_description': 'description', 'newsletter_image_original': 'imageUrl', 'card_url': 'url', 'revue_account_id': 'revueAccountId', 'issue_count': 'issueCount'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'newsletter_title': 'title', 
'newsletter_description': 'description', 'newsletter_image_original': 'imageUrl', 'card_url': 'url', 'revue_account_id': 'revueAccountId', 'issue_count': 'issueCount'}) kwargs['revueAccountId'] = int(kwargs['revueAccountId']) kwargs['issueCount'] = int(kwargs['issueCount']) return NewsletterCard(**kwargs) elif cardName == '3337203208:newsletter_issue': - kwargs = _kwargs_from_map({ + kwargs = snscrape.utils.dict_map(bindingValues, { 'newsletter_title': 'newsletterTitle', 'newsletter_description': 'newsletterDescription', 'issue_title': 'issueTitle', @@ -1080,11 +1172,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): id = bindingValues['amplify_content_id'], video = Video( thumbnailUrl = bindingValues['player_image'], - variants = [VideoVariant(contentType = bindingValues['player_stream_content_type'], url = bindingValues['amplify_url_vmap'], bitrate = None)], + variants = [VideoVariant(url = bindingValues['amplify_url_vmap'], contentType = bindingValues.get('player_stream_content_type'), bitrate = None)], ), ) elif cardName == 'appplayer': - kwargs = _kwargs_from_map({'title': 'title', 'app_category': 'appCategory', 'player_owner_id': 'playerOwnerId', 'site': 'siteUser'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'title': 'title', 'app_category': 'appCategory', 'player_owner_id': 'playerOwnerId', 'site': 'siteUser'}) kwargs['playerOwnerId'] = int(kwargs['playerOwnerId']) variants = [] variants.append(VideoVariant(contentType = 'application/x-mpegurl', url = bindingValues['player_hls_url'], bitrate = None)) @@ -1094,7 +1186,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['video'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds'])) return AppPlayerCard(**kwargs) elif cardName == '3691233323:audiospace': - return SpacesCard(**_kwargs_from_map({'card_url': 'url', 'id': 'id'})) + return SpacesCard(**snscrape.utils.dict_map(bindingValues, {'card_url': 
'url', 'id': 'id'})) elif cardName == '2586390716:message_me': # Note that the strings in Twitter's JS appear to have an incorrect mapping that then gets changed somewhere in the 1.8 MiB of JS! # cta_1, 3, and 4 should mean 'Message us', 'Send a private message', and 'Send me a private message', but the correct mapping is currently unknown. @@ -1102,7 +1194,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if bindingValues['cta'] not in ctas: _logger.warning(f'Unsupported message_me card cta on tweet {tweetId}: {bindingValues["cta"]!r}') return - return MessageMeCard(**_kwargs_from_map({'recipient': 'recipient', 'card_url': 'url'}), buttonText = ctas[bindingValues['cta']]) + return MessageMeCard(**snscrape.utils.dict_map(bindingValues, {'recipient': 'recipient', 'card_url': 'url'}), buttonText = ctas[bindingValues['cta']]) elif cardName == 'unified_card': o = json.loads(bindingValues['unified_card']) kwargs = {} @@ -1112,6 +1204,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): 'image_app', 'image_carousel_app', 'image_carousel_website', + 'image_collection_website', 'image_multi_dest_carousel_website', 'image_website', 'mixed_media_multi_dest_carousel_website', @@ -1205,11 +1298,16 @@ class _TwitterAPIScraper(snscrape.base.Scraper): vKwargs['title'] = var['title']['content'] if 'description' in var: vKwargs['description'] = var['description']['content'] - vKwargs['category'] = var['category']['content'] + if 'category' in var: + vKwargs['category'] = var['category']['content'] if (ratings := var['ratings']): vKwargs['ratingAverage'] = var['ratings']['star'] vKwargs['ratingCount'] = var['ratings']['count'] vKwargs['url'] = f'https://play.google.com/store/apps/details?id={var["id"]}' if var['type'] == 'android_app' else f'https://itunes.apple.com/app/id{var["id"]}' + if 'iconMediumKey' in vKwargs and vKwargs['iconMediumKey'] not in kwargs['media']: + # https://github.com/JustAnotherArchivist/snscrape/issues/470 + _logger.warning(f'Tweet {tweetId} contains an 
app icon medium key {vKwargs["iconMediumKey"]!r} on app {vKwargs["type"]!r}/{vKwargs["id"]!r}, but the corresponding medium is missing; dropping') + del vKwargs['iconMediumKey'] variants.append(UnifiedCardApp(**vKwargs)) kwargs['apps'][k] = variants @@ -1217,15 +1315,54 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['components'] = o['components'] if 'layout' in o: - if o['layout']['type'] != 'swipeable': + if o['layout']['type'] == 'swipeable': + kwargs['swipeableLayoutSlides'] = [UnifiedCardSwipeableLayoutSlide(mediumComponentKey = v[0], componentKey = v[1]) for v in o['layout']['data']['slides']] + elif o['layout']['type'] == 'collection': + kwargs['collectionLayoutSlides'] = [UnifiedCardCollectionLayoutSlide(detailsComponentKey = v[0], mediumComponentKey = v[1]) for v in o['layout']['data']['slides']] + else: _logger.warning(f'Unsupported unified_card layout type on tweet {tweetId}: {o["layout"]["type"]!r}') return - kwargs['swipeableLayoutSlides'] = [UnifiedCardSwipeableLayoutSlide(mediumComponentKey = v[0], componentKey = v[1]) for v in o['layout']['data']['slides']] - return UnifiedCard(**kwargs) + card = UnifiedCard(**kwargs) + + # Consistency checks + missingParts = set() + if card.components and not all(k in card.componentObjects for k in card.components): + missingParts.add('components') + if card.swipeableLayoutSlides and not all(s.mediumComponentKey in card.componentObjects and s.componentKey in card.componentObjects for s in card.swipeableLayoutSlides): + missingParts.add('components') + if any(c.destinationKey not in card.destinations for c in card.componentObjects.values() if hasattr(c, 'destinationKey')): + missingParts.add('destinations') + if any(b.destinationKey not in card.destinations for c in card.componentObjects.values() if isinstance(c, UnifiedCardButtonGroupComponentObject) for b in c.buttons): + missingParts.add('destinations') + mediaKeys = [] + for c in card.componentObjects.values(): + if isinstance(c, 
UnifiedCardMediumComponentObject): + mediaKeys.append(c.mediumKey) + elif isinstance(c, UnifiedCardSwipeableMediaComponentObject): + mediaKeys.extend(x.mediumKey for x in c.media) + mediaKeys.extend(d.mediumKey for d in card.destinations.values() if d.mediumKey is not None) + mediaKeys.extend(a.iconMediumKey for l in (card.apps.values() if card.apps is not None else []) for a in l if a.iconMediumKey is not None) + if any(k not in card.media for k in mediaKeys): + missingParts.add('media') + if any(c.appKey not in card.apps for c in card.componentObjects.values() if hasattr(c, 'appKey')): + missingParts.add('apps') + if any(d.appKey not in card.apps for d in card.destinations.values() if d.appKey is not None): + missingParts.add('apps') + if missingParts: + _logger.warning(f'Consistency errors in unified card on tweet {tweetId}: missing {", ".join(missingParts)}') + + return card _logger.warning(f'Unsupported card type on tweet {tweetId}: {cardName!r}') + def _make_vibe(self, vibe): + return Vibe( + text = vibe['text'], + imageUrl = vibe['imgUrl'], + imageDescription = vibe['imgDescription'], + ) + def _tweet_to_tweet(self, tweet, obj): user = self._user_to_user(obj['globalObjects']['users'][tweet['user_id_str']]) kwargs = {} @@ -1235,14 +1372,36 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['quotedTweet'] = self._tweet_to_tweet(obj['globalObjects']['tweets'][tweet['quoted_status_id_str']], obj) if 'card' in tweet: kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2, self._get_tweet_id(tweet)) + if 'ext_views' in tweet and 'count' in tweet['ext_views']: + kwargs['viewCount'] = int(tweet['ext_views']['count']) + if 'vibe' in tweet.get('ext', {}): + kwargs['vibe'] = self._make_vibe(tweet['ext']['vibe']['r']['ok']) return self._make_tweet(tweet, user, **kwargs) - def _graphql_timeline_tweet_item_result_to_tweet(self, result): + def _make_tombstone(self, tweetId, info): + if tweetId is None: + raise snscrape.base.ScraperException('Cannot 
create tombstone without tweet ID') + if info and (text := info.get('richText', info['text'])): + return Tombstone( + id = tweetId, + text = text['text'], + textLinks = [TextLink(text = text['text'][x['fromIndex']:x['toIndex']], url = x['ref']['url'], tcourl = None, indices = (x['fromIndex'], x['toIndex'])) for x in text['entities']], + ) + else: + return Tombstone(id = tweetId) + + def _graphql_timeline_tweet_item_result_to_tweet(self, result, tweetId = None): if result['__typename'] == 'Tweet': pass elif result['__typename'] == 'TweetWithVisibilityResults': #TODO Include result['softInterventionPivot'] in the Tweet object result = result['tweet'] + elif result['__typename'] == 'TweetTombstone': + return self._make_tombstone(tweetId, result.get('tombstone')) + elif result['__typename'] == 'TweetUnavailable': + if tweetId is None: + raise snscrape.base.ScraperException('Cannot handle unavailable tweet without tweet ID') + return TweetRef(id = tweetId) else: raise snscrape.base.ScraperException(f'Unknown result type {result["__typename"]!r}') tweet = result['legacy'] @@ -1250,21 +1409,34 @@ class _TwitterAPIScraper(snscrape.base.Scraper): user = self._user_to_user(result['core']['user_results']['result']['legacy'], id_ = userId) kwargs = {} if 'retweeted_status_result' in tweet: + #TODO Tombstones will cause a crash here. 
kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result']) if 'quoted_status_result' in result: - if result['quoted_status_result']['result']['__typename'] == 'TweetTombstone': - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) + if 'result' not in result['quoted_status_result']: + _logger.warning(f'quoted_status_result for {tweet["quoted_status_id_str"]} without an actual result on tweet {self._get_tweet_id(tweet)}, using TweetRef') + kwargs['quotedTweet'] = TweetRef(int(tweet['quoted_status_id_str'])) else: - kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result']) - elif 'quotedRefResult' in result: + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) + elif result.get('quotedRefResult'): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str'])) else: - kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id'])) + qTweet = result['quotedRefResult']['result'] + if result['quotedRefResult']['result']['__typename'] not in ('Tweet', 'TweetWithVisibilityResults'): + _logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef') + elif result['quotedRefResult']['result']['__typename'] == 'TweetWithVisibilityResults': + qTweet = qTweet['tweet'] + kwargs['quotedTweet'] = TweetRef(id = int(qTweet['rest_id'])) elif 'quoted_status_id_str' in tweet: - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) + # Omit the TweetRef if this 
is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet. + if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'): + kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) if 'card' in result: kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL, self._get_tweet_id(tweet)) + if 'views' in result and 'count' in result['views']: + kwargs['viewCount'] = int(result['views']['count']) + if 'vibe' in result: + kwargs['vibe'] = self._make_vibe(result['vibe']) return self._make_tweet(tweet, user, **kwargs) def _graphql_timeline_instructions_to_tweets(self, instructions, includeConversationThreads = False): @@ -1273,14 +1445,32 @@ class _TwitterAPIScraper(snscrape.base.Scraper): continue for entry in instruction['entries']: if entry['entryId'].startswith('tweet-'): + tweetId = int(entry['entryId'].split('-', 1)[1]) if entry['content']['entryType'] == 'TimelineTimelineItem' and entry['content']['itemContent']['itemType'] == 'TimelineTweet': - yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) + if 'result' not in entry['content']['itemContent']['tweet_results']: + _logger.warning(f'Skipping empty tweet entry {entry["entryId"]}') + continue + yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result'], tweetId = tweetId) else: - logger.warning('Got unrecognised timeline tweet item(s)') + _logger.warning('Got unrecognised timeline tweet item(s)') + elif entry['entryId'].startswith('homeConversation-'): + if entry['content']['entryType'] == 'TimelineTimelineModule': + for item in entry['content']['items']: + if not item['entryId'].startswith('homeConversation-') or '-tweet-' not in item['entryId']: + raise snscrape.base.ScraperException(f'Unexpected home conversation entry ID: 
{item["entryId"]!r}') + tweetId = int(item['entryId'].split('-tweet-', 1)[1]) + if item['item']['itemContent']['itemType'] == 'TimelineTweet': + if 'result' in item['item']['itemContent']['tweet_results']: + yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) + else: + yield TweetRef(id = tweetId) elif includeConversationThreads and entry['entryId'].startswith('conversationthread-'): #TODO show more cursor? for item in entry['content']['items']: if item['entryId'].startswith(f'{entry["entryId"]}-tweet-'): - yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result']) + tweetId = int(item['entryId'][len(entry['entryId']) + 7:]) + yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) + elif not entry['entryId'].startswith('cursor-'): + _logger.warning(f'Skipping unrecognised entry ID: {entry["entryId"]!r}') def _render_text_with_urls(self, text, urls): if not urls: @@ -1300,10 +1490,15 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['username'] = user['screen_name'] kwargs['id'] = id_ if id_ else user['id'] if 'id' in user else int(user['id_str']) kwargs['displayname'] = user['name'] - kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] + kwargs['renderedDescription'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) if user['entities']['description'].get('urls'): - kwargs['descriptionUrls'] = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['entities']['description']['urls']] + kwargs['descriptionLinks'] = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = 
tuple(x['indices']), + ) for x in user['entities']['description']['urls']] kwargs['verified'] = user.get('verified') kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at']) kwargs['followersCount'] = user['followers_count'] @@ -1314,12 +1509,16 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['mediaCount'] = user['media_count'] kwargs['location'] = user['location'] kwargs['protected'] = user.get('protected') - if 'url' in user['entities']: - kwargs['linkUrl'] = (user['entities']['url']['urls'][0].get('expanded_url') or user.get('url')) - kwargs['linkTcourl'] = user.get('url') + if user.get('url'): + entity = user['entities'].get('url', {}).get('urls', [None])[0] + if not entity or entity['url'] != user['url']: + _logger.warning(f'Link inconsistency on user {kwargs["id"]}') + if not entity: + entity = {'indices': (0, len(user['url']))} + kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') - if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): + if 'ext' in user and 'highlightedLabel' in user['ext'] and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): kwargs['label'] = self._user_label_to_user_label(label) return User(**kwargs) @@ -1334,32 +1533,69 @@ class _TwitterAPIScraper(snscrape.base.Scraper): labelKwargs['longDescription'] = label['longDescription']['text'] return UserLabel(**labelKwargs) + def _graphql_user_results_to_user_ref(self, obj): + if 'id' not in obj: + return None + if isinstance(obj['id'], int): + userId = obj['id'] + elif obj['id'].startswith('VXNlclJlc3VsdHM6'): + # UserResults: in base64 + try: + userId = base64.b64decode(obj['id']) + except ValueError: + return None + assert userId.startswith(b'UserResults:') + userId = 
int(userId.split(b':', 1)[1]) + kwargs = {} + if 'result' in obj and obj['result']['__typename'] == 'UserUnavailable' and 'unavailable_message' in obj['result']: + kwargs['text'] = obj['result']['unavailable_message']['text'] + kwargs['textLinks'] = [TextLink(text = kwargs['text'][x['fromIndex']:x['toIndex']], url = x['ref']['url'], tcourl = None, indices = (x['fromIndex'], x['toIndex'])) for x in obj['result']['unavailable_message']['entities']] + return UserRef(id = userId, **kwargs) + + def _graphql_user_results_to_user(self, results): + if 'result' not in results or results['result']['__typename'] == 'UserUnavailable': + return self._graphql_user_results_to_user_ref(results) + return self._user_to_user(results['result']['legacy'], id_ = int(results['result']['rest_id'])) + @classmethod def _cli_construct(cls, argparseArgs, *args, **kwargs): kwargs['guestTokenManager'] = _CLIGuestTokenManager() return super()._cli_construct(argparseArgs, *args, **kwargs) +class TwitterSearchScraperMode(enum.Enum): + LIVE = 'live' + TOP = 'top' + USER = 'user' + + @classmethod + def _cli_from_args(cls, args): + if args.top: + return cls.TOP + if args.user: + return cls.USER + return cls.LIVE + + class TwitterSearchScraper(_TwitterAPIScraper): name = 'twitter-search' - def __init__(self, query, *, cursor = None, top = False, **kwargs): + def __init__(self, query, *, cursor = None, mode = TwitterSearchScraperMode.LIVE, top = None, maxEmptyPages = 20, **kwargs): if not query.strip(): raise ValueError('empty query') + if mode not in tuple(TwitterSearchScraperMode): + raise ValueError('invalid mode, must be a TwitterSearchScraperMode') + kwargs['maxEmptyPages'] = maxEmptyPages super().__init__(baseUrl = 'https://twitter.com/search?' 
+ urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs) self._query = query # Note: may get replaced by subclasses when using user ID resolution + if cursor is not None: + warnings.warn('the `cursor` argument is deprecated', snscrape.base.DeprecatedFeatureWarning, stacklevel = 2) self._cursor = cursor - self._top = top - - def _check_scroll_response(self, r): - if r.status_code == 429: - # Accept a 429 response as "valid" to prevent retries; handled explicitly in get_items - return True, None - if r.headers.get('content-type').replace(' ', '') != 'application/json;charset=utf-8': - return False, 'content type is not JSON' - if r.status_code != 200: - return False, 'non-200 status code' - return True, None + if top is not None: + replacement = f'{__name__}.TwitterSearchScraperMode.' + ('TOP' if top else 'LIVE') + warnings.warn(f'`top` argument is deprecated, use `mode = {replacement}` instead of `top = {bool(top)}`', snscrape.base.DeprecatedFeatureWarning, stacklevel = 2) + mode = TwitterSearchScraperMode.TOP if top else TwitterSearchScraperMode.LIVE + self._mode = mode def get_items(self): if not self._query.strip(): @@ -1373,47 +1609,70 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', + 'include_ext_has_nft_avatar': '1', + 'include_ext_is_blue_verified': '1', + 'include_ext_verified_type': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', 'include_ext_alt_text': 'true', + 'include_ext_limited_action_results': 'false', 'include_quote_count': 'true', 'include_reply_count': '1', 'tweet_mode': 'extended', + 'include_ext_collab_control': 'true', + 'include_ext_views': 'true', 'include_entities': 'true', 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 
'true', 'send_error_codes': 'true', - 'simple_quoted_tweets': 'true', + 'simple_quoted_tweet': 'true', 'q': self._query, - 'tweet_search_mode': 'live', - 'count': '100', + } + if self._mode is TwitterSearchScraperMode.LIVE: + paginationParams = { + **paginationParams, + 'tweet_search_mode': 'live', + } + elif self._mode is TwitterSearchScraperMode.TOP: + pass + elif self._mode is TwitterSearchScraperMode.USER: + paginationParams = { + **paginationParams, + 'result_filter': 'user', + 'query_source': '', + } + paginationParams = { + **paginationParams, + 'count': '20', 'query_source': 'spelling_expansion_revert_click', 'cursor': None, 'pc': '1', 'spelling_corrections': '1', - 'ext': 'mediaStats,highlightedLabel', + 'include_ext_edit_control': 'true', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe', } params = paginationParams.copy() del params['cursor'] - if self._top: - del params['tweet_search_mode'] - del paginationParams['tweet_search_mode'] - for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', _TwitterAPIType.V2, params, paginationParams, cursor = self._cursor): - yield from self._v2_timeline_instructions_to_tweets(obj) + yield from self._v2_timeline_instructions_to_tweets_or_users(obj) @classmethod def _cli_setup_parser(cls, subparser): - subparser.add_argument('--cursor', metavar = 'CURSOR') - subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological') + subparser.add_argument('--cursor', metavar = 'CURSOR', help = '(deprecated)') + group = subparser.add_mutually_exclusive_group(required = False) + group.add_argument('--top', action = 'store_true', default = False, help = 'Search top tweets instead of live/chronological') + group.add_argument('--user', action = 'store_true', default = False, help = 'Search users instead of tweets') + 
subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable') subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string') @classmethod def _cli_from_args(cls, args): - return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top) + return cls._cli_construct(args, args.query, cursor = args.cursor, mode = TwitterSearchScraperMode._cli_from_args(args), maxEmptyPages = args.maxEmptyPages) class TwitterUserScraper(TwitterSearchScraper): @@ -1436,12 +1695,24 @@ class TwitterUserScraper(TwitterSearchScraper): fieldName = 'userId' endpoint = 'https://twitter.com/i/api/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId' variables = {fieldName: str(self._user), 'withSafetyModeUserFields': True, 'withSuperFollowsUserFields': True} - obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = variables) - if not obj['data'] or obj['data']['user']['result']['__typename'] == 'UserUnavailable': + obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = {'variables': variables}) + if not obj['data'] or 'result' not in obj['data']['user']: + _logger.warning('Empty response') + return None + if obj['data']['user']['result']['__typename'] == 'UserUnavailable': + _logger.warning('User unavailable') return None user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] - description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + renderedDescription = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + link = None + if user['legacy'].get('url'): + entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] + if not entity or entity['url'] != user['legacy']['url']: + _logger.warning(f'Link inconsistency on user') + if not entity: + entity = {'indices': 
(0, len(user['legacy']['url']))} + link = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['legacy']['url']), tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) label = None if (labelO := user['affiliates_highlighted_label'].get('label')): label = self._user_label_to_user_label(labelO) @@ -1449,9 +1720,14 @@ class TwitterUserScraper(TwitterSearchScraper): username = user['legacy']['screen_name'], id = int(user['rest_id']), displayname = user['legacy']['name'], - description = description, rawDescription = rawDescription, - descriptionUrls = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['legacy']['entities']['description']['urls']], + renderedDescription = renderedDescription, + descriptionLinks = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['legacy']['entities']['description']['urls']], verified = user['legacy']['verified'], created = email.utils.parsedate_to_datetime(user['legacy']['created_at']), followersCount = user['legacy']['followers_count'], @@ -1462,8 +1738,7 @@ class TwitterUserScraper(TwitterSearchScraper): mediaCount = user['legacy']['media_count'], location = user['legacy']['location'], protected = user['legacy']['protected'], - linkUrl = user['legacy']['entities']['url']['urls'][0]['expanded_url'] if 'url' in user['legacy']['entities'] else None, - linkTcourl = user['legacy'].get('url'), + link = link, profileImageUrl = user['legacy']['profile_image_url_https'], profileBannerUrl = user['legacy'].get('profile_banner_url'), label = label, @@ -1472,6 +1747,8 @@ class TwitterUserScraper(TwitterSearchScraper): def get_items(self): if self._isUserId: # Resolve user ID to username + if self.entity is None: + raise snscrape.base.ScraperException(f'Could not resolve user ID {self._user!r} to username') self._user = self.entity.username 
self._isUserId = False self._query = f'from:{self._user}' @@ -1479,7 +1756,7 @@ class TwitterUserScraper(TwitterSearchScraper): @staticmethod def is_valid_username(s): - return 1 <= len(s) <= 15 and s.strip(string.ascii_letters + string.digits + '_') == '' + return 1 <= len(s) <= 20 and s.strip(string.ascii_letters + string.digits + '_') == '' @classmethod def _cli_setup_parser(cls, subparser): @@ -1501,6 +1778,8 @@ class TwitterProfileScraper(TwitterUserScraper): def get_items(self): if not self._isUserId: + if self.entity is None: + raise snscrape.base.ScraperException(f'Could not resolve username {self._user!r} to ID') userId = self.entity.id else: userId = self._user @@ -1516,19 +1795,47 @@ class TwitterProfileScraper(TwitterUserScraper): 'withReactionsPerspective': False, 'withSuperFollowsTweetFields': True, 'withVoice': True, - 'withV2Timeline': False, + 'withV2Timeline': True, } variables = paginationVariables.copy() del variables['cursor'] + features = { + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': False, + 'verified_phone_label_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'vibe_api_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': False, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_text_conversations_enabled': False, + 
'responsive_web_enhance_cards_enabled': False, + } + + params = {'variables': variables, 'features': features} + paginationParams = {'variables': paginationVariables, 'features': features} gotPinned = False - for obj in self._iter_api_data('https://twitter.com/i/api/graphql/BSKxQ9_IaCoVyIvQHQROIQ/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, variables, paginationVariables): - instructions = obj['data']['user']['result']['timeline']['timeline']['instructions'] + for obj in self._iter_api_data('https://twitter.com/i/api/graphql/nrdle2catTyGnTyj1Qa7wA/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, params, paginationParams): + if obj['data']['user']['result']['__typename'] == 'UserUnavailable': + _logger.warning('User unavailable') + break + instructions = obj['data']['user']['result']['timeline_v2']['timeline']['instructions'] if not gotPinned: for instruction in instructions: if instruction['type'] == 'TimelinePinEntry': gotPinned = True - yield self._graphql_timeline_tweet_item_result_to_tweet(instruction['entry']['content']['itemContent']['tweet_results']['result']) + tweetId = int(instruction['entry']['entryId'][6:]) if instruction['entry']['entryId'].startswith('tweet-') else None + yield self._graphql_timeline_tweet_item_result_to_tweet(instruction['entry']['content']['itemContent']['tweet_results']['result'], tweetId = tweetId) yield from self._graphql_timeline_instructions_to_tweets(instructions) @@ -1548,6 +1855,22 @@ class TwitterHashtagScraper(TwitterSearchScraper): return cls._cli_construct(args, args.hashtag) +class TwitterCashtagScraper(TwitterSearchScraper): + name = 'twitter-cashtag' + + def __init__(self, cashtag, **kwargs): + super().__init__(f'${cashtag}', **kwargs) + self._cashtag = cashtag + + @classmethod + def _cli_setup_parser(cls, subparser): + subparser.add_argument('cashtag', type = snscrape.base.nonempty_string('cashtag'), help = 'A Twitter cashtag (without $)') + + @classmethod + def _cli_from_args(cls, args): + return 
cls._cli_construct(args, args.cashtag) + + class TwitterTweetScraperMode(enum.Enum): SINGLE = 'single' SCROLL = 'scroll' @@ -1579,50 +1902,71 @@ class TwitterTweetScraper(_TwitterAPIScraper): 'includePromotedContent': True, 'withCommunity': True, 'withQuickPromoteEligibilityTweetFields': True, - 'withTweetQuoteCount': True, - 'withBirdwatchNotes': True, + 'withBirdwatchNotes': False, 'withSuperFollowsUserFields': True, - 'withBirdwatchPivots': False, 'withDownvotePerspective': False, 'withReactionsMetadata': False, 'withReactionsPerspective': False, 'withSuperFollowsTweetFields': True, 'withVoice': True, - 'withV2Timeline': False, + 'withV2Timeline': True, } variables = paginationVariables.copy() del variables['cursor'], variables['referrer'] - url = 'https://twitter.com/i/api/graphql/8svRea_Lc0_mdhwP6dqe0Q/TweetDetail' + features = { + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': False, + 'verified_phone_label_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'vibe_api_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': False, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_text_conversations_enabled': False, + 'responsive_web_enhance_cards_enabled': False, + } + + params = {'variables': variables, 'features': features} + paginationParams = {'variables': paginationVariables, 'features': features} + url = 
'https://twitter.com/i/api/graphql/NNiD2K-nEYUfXlMwGCocMQ/TweetDetail' if self._mode is TwitterTweetScraperMode.SINGLE: - obj = self._get_api_data(url, _TwitterAPIType.GRAPHQL, params = variables) + obj = self._get_api_data(url, _TwitterAPIType.GRAPHQL, params = params) if not obj['data']: return - for instruction in obj['data']['threaded_conversation_with_injections']['instructions']: + for instruction in obj['data']['threaded_conversation_with_injections_v2']['instructions']: if instruction['type'] != 'TimelineAddEntries': continue for entry in instruction['entries']: if entry['entryId'] == f'tweet-{self._tweetId}' and entry['content']['entryType'] == 'TimelineTimelineItem' and entry['content']['itemContent']['itemType'] == 'TimelineTweet': - yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) + yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result'], tweetId = self._tweetId) break elif self._mode is TwitterTweetScraperMode.SCROLL: - for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, variables, paginationVariables, direction = _ScrollDirection.BOTH): + for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, params, paginationParams, direction = _ScrollDirection.BOTH): if not obj['data']: continue - yield from self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections']['instructions'], includeConversationThreads = True) + yield from self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections_v2']['instructions'], includeConversationThreads = True) elif self._mode is TwitterTweetScraperMode.RECURSE: seenTweets = set() queue = collections.deque() queue.append(self._tweetId) while queue: tweetId = queue.popleft() - thisPagVariables = paginationVariables.copy() - thisPagVariables['focalTweetId'] = str(tweetId) - thisVariables = thisPagVariables.copy() - del 
class TwitterCommunityScraper(_TwitterAPIScraper):
    '''Scraper for a Twitter community: its metadata (entity) and the tweets of its timeline.'''

    name = 'twitter-community'

    def __init__(self, communityId, **kwargs):
        self._communityId = communityId
        super().__init__(f'https://twitter.com/i/communities/{self._communityId}', **kwargs)

    def _get_entity(self):
        '''Fetch the community's metadata through the CommunitiesFetchOneQuery endpoint.

        Returns a Community, or None (after logging a warning) when the response is empty
        or reports the community as unavailable.
        '''

        self._ensure_guest_token()
        params = {
            'variables': {
                'communityId': str(self._communityId),
                'withDmMuting': False,
                'withSafetyModeUserFields': False,
                'withSuperFollowsUserFields': True,
            },
            'features': {
                'responsive_web_graphql_exclude_directive_enabled': False,
                'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
                'responsive_web_graphql_timeline_navigation_enabled': True,
                'responsive_web_twitter_blue_verified_badge_is_enabled': True,
                'verified_phone_label_enabled': False,
            },
        }
        obj = self._get_api_data('https://api.twitter.com/graphql/MO8cE7aTvaenXJX_teUGcA/CommunitiesFetchOneQuery', _TwitterAPIType.GRAPHQL, params = params)
        if not obj['data'] or 'result' not in obj['data']['communityResults']:
            _logger.warning('Empty response')
            return None
        if obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable':
            _logger.warning('Community unavailable')
            return None
        community = obj['data']['communityResults']['result']
        # The description is only passed on when the response contains one.
        optKwargs = {}
        if 'description' in community:
            optKwargs['description'] = community['description']
        return Community(
            id = int(community['id_str']),
            name = community['name'],
            # NOTE(review): the division by 1000 suggests created_at is in milliseconds since the epoch; confirm against the API.
            created = datetime.datetime.fromtimestamp(community['created_at'] / 1000, tz = datetime.timezone.utc),
            admin = self._graphql_user_results_to_user(community['admin_results']),
            creator = self._graphql_user_results_to_user(community['creator_results']),
            membersFacepile = [self._graphql_user_results_to_user(m) for m in community['members_facepile_results']],
            moderatorsCount = community['moderator_count'],
            membersCount = community['member_count'],
            rules = [r['name'] for r in community['rules']],
            # Prefer the custom value and only look up the default when the custom one is absent.
            # dict.get(key, community[default]) would evaluate the default subscript eagerly and
            # raise KeyError for a response lacking the default key even if the custom key exists.
            theme = community['custom_theme'] if 'custom_theme' in community else community['default_theme'],
            bannerUrl = (community['custom_banner_media'] if 'custom_banner_media' in community else community['default_banner_media'])['media_info']['original_img_url'],
            **optKwargs,
        )

    def get_items(self):
        '''Yield the tweets of the community timeline via the CommunityTweetsTimeline endpoint.

        Stops with a warning as soon as a response reports the community as unavailable.
        '''

        paginationVariables = {
            'count': 20,
            'cursor': None,
            'communityId': str(self._communityId),
            'withCommunity': True,
            'withSuperFollowsUserFields': True,
            'withDownvotePerspective': False,
            'withReactionsMetadata': False,
            'withReactionsPerspective': False,
            'withSuperFollowsTweetFields': True,
        }
        # The initial request uses the same variables minus 'count' and 'cursor'.
        variables = paginationVariables.copy()
        del variables['count'], variables['cursor']
        features = {
            'responsive_web_twitter_blue_verified_badge_is_enabled': True,
            'responsive_web_graphql_exclude_directive_enabled': False,
            'verified_phone_label_enabled': False,
            'responsive_web_graphql_timeline_navigation_enabled': True,
            'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
            'tweetypie_unmention_optimization_enabled': True,
            'vibe_api_enabled': True,
            'responsive_web_edit_tweet_api_enabled': True,
            'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
            'view_counts_everywhere_api_enabled': True,
            'longform_notetweets_consumption_enabled': True,
            'tweet_awards_web_tipping_enabled': False,
            'freedom_of_speech_not_reach_fetch_enabled': False,
            'standardized_nudges_misinfo': True,
            'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False,
            'interactive_text_enabled': True,
            'responsive_web_text_conversations_enabled': False,
            'responsive_web_enhance_cards_enabled': False,
        }
        params = {'variables': variables, 'features': features}
        paginationParams = {'variables': paginationVariables, 'features': features}

        for obj in self._iter_api_data('https://api.twitter.com/graphql/Qvst9FkHq45wuqicCvMpVw/CommunityTweetsTimeline', _TwitterAPIType.GRAPHQL, params, paginationParams):
            if obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable':
                _logger.warning('Community unavailable')
                break
            yield from self._graphql_timeline_instructions_to_tweets(obj['data']['communityResults']['result']['community_timeline']['timeline']['instructions'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the positional integer 'communityId' argument on the given subparser.'''
        subparser.add_argument('communityId', type = int, help = 'A community ID')

    @classmethod
    def _cli_from_args(cls, args):
        '''Construct the scraper from parsed CLI arguments.'''
        return cls._cli_construct(args, args.communityId)
TwitterTrendsScraper(_TwitterAPIScraper): 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 'true', 'send_error_codes': 'true', 'simple_quoted_tweet': 'true', 'count': '20', 'candidate_source': 'trends', 'include_page_configuration': 'false', 'entity_tokens': 'false', - 'ext': 'mediaStats,highlightedLabel,voiceInfo', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', } obj = self._get_api_data('https://twitter.com/i/api/2/guide.json', _TwitterAPIType.V2, params) for instruction in obj['timeline']['instructions']: @@ -1702,3 +2151,6 @@ class TwitterTrendsScraper(_TwitterAPIScraper): for item in entry['content']['timelineModule']['items']: trend = item['item']['content']['trend'] yield Trend(name = trend['name'], metaDescription = trend['trendMetadata'].get('metaDescription'), domainContext = trend['trendMetadata']['domainContext']) + + +__getattr__, __dir__ = snscrape.base._module_deprecation_helper(__all__, DescriptionURL = TextLink) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 9353cd5..a8d6b46 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -32,7 +32,7 @@ _logger = logging.getLogger(__name__) _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] _datePattern = re.compile(r'^(?Ptoday' r'|yesterday' - r'|(?P\d+)\s+(?P' + '|'.join(_months) + ')(\s+(?P\d{4}))?' + r'|(?P\d+)\s+(?P' + '|'.join(_months) + r')(\s+(?P\d{4}))?' 
def is_photo(self, a):
    '''Return whether the tag `a` has an 'aria-label' attribute whose value starts with 'photo'.'''
    try:
        label = a.attrs['aria-label']
    except KeyError:
        # No aria-label at all: not a photo.
        return False
    return label.startswith('photo')
def dict_map(input, keyMap):
    '''Build a new dict by renaming keys of `input` according to `keyMap`.

    `keyMap` has the form {'input_key': 'output_key'}. Entries whose input key
    is not present in `input` are skipped.
    '''

    renamed = {}
    for sourceKey, targetKey in keyMap.items():
        if sourceKey in input:
            renamed[targetKey] = input[sourceKey]
    return renamed


def snake_to_camel(**kwargs):
    '''Return a new dict with the values of kwargs stored under camelCase versions of their snake_case keys.'''

    converted = {}
    for snakeKey, value in kwargs.items():
        # The first part is kept verbatim; every later part only gets its first character uppercased.
        head, *tail = snakeKey.split('_')
        camelKey = head + ''.join(part[:1].upper() + part[1:] for part in tail)
        converted[camelKey] = value
    return converted