95 Commits

Author SHA1 Message Date
Tristan Lee
cacd783b95 merged upstream changes 2023-04-04 04:19:54 -05:00
JustAnotherArchivist
3dd9c28e31 Add snake_to_camel helper 2023-04-03 02:35:26 +00:00
JustAnotherArchivist
7186c833dd Move dict remapping helper to utils module 2023-04-03 02:35:05 +00:00
JustAnotherArchivist
1c3a592415 Fix KeyError on broadcast cards with incomplete broadcaster user data
Fixes #810
2023-04-01 22:08:20 +00:00
JustAnotherArchivist
285d5874fc Deprecate cursor argument to TwitterSearchScraper
#778
2023-03-20 19:19:19 +00:00
JustAnotherArchivist
adac052723 Treat 404 responses from Twitter as a block 2023-03-20 19:15:11 +00:00
JustAnotherArchivist
edac5f38cb Validate mode parameter on TwitterSearchScraper 2023-03-15 01:19:53 +00:00
JustAnotherArchivist
b93cf2640c Revise bug reporting instructions: add instructions and field for debug log, put less emphasis on dump files, request minimal reproducer, and reorder template 2023-03-15 01:14:12 +00:00
JustAnotherArchivist
e47fbe3d1f Bump user agent
Fixes #760
2023-03-14 03:03:50 +00:00
JustAnotherArchivist
99050710d7 Fix AttributeError crashes on resolving user IDs to usernames or vice-versa 2023-03-03 02:25:48 +00:00
JustAnotherArchivist
3f7bb0516d Fix crash due to missing profile timeline on unavailable users (e.g. protected) 2023-03-03 01:32:07 +00:00
JustAnotherArchivist
98b50ff9e9 Separate warnings for empty responses and unavailable users/communities 2023-03-03 01:16:49 +00:00
JustAnotherArchivist
fd75fff202 Fix crash on communities without a description 2023-03-03 00:39:08 +00:00
JustAnotherArchivist
c77d19da5d Fix crash on some deleted tweets in communities 2023-03-03 00:31:30 +00:00
JustAnotherArchivist
945bfbde04 Merge pull request #743 from kelcheone/master
Add Twitter cashtag scraper
2023-03-02 21:24:07 +00:00
KΞVIN KΞLCHΞ
0942beedd6 fix: code style line spacing 2023-03-02 19:08:53 +00:00
KΞVIN KΞLCHΞ
3545837637 fix: code style line spacing 2023-03-02 19:05:16 +00:00
KΞVIN KΞLCHΞ
aa8d93e07c Merge branch 'JustAnotherArchivist:master' into master 2023-03-01 22:49:43 +03:00
kelche
7061ad2eb5 fix: code style 2023-03-01 18:09:34 +03:00
JustAnotherArchivist
03ef3debaf Fix behaviour on SIGPIPE/BrokenPipeError 2023-02-28 20:20:28 +00:00
JustAnotherArchivist
42cb6d8170 Fix crash on quotedRefResult without an actual result
Fixes #740
2023-02-28 20:16:55 +00:00
JustAnotherArchivist
ea7c6786c2 Handle TweetWithVisibilityResults on quoted tweets
Fixes #604
2023-02-28 20:16:07 +00:00
kelche
61dbbba6b1 feat: cashtag func 2023-02-27 22:39:31 +03:00
kelche
d1592177ab feat: cashtag func 2023-02-27 22:35:21 +03:00
JustAnotherArchivist
21cf626803 Update list of scrapers 2023-02-21 22:10:33 +00:00
JustAnotherArchivist
f329b69ed4 Add support for scraping Twitter's user search
#263
2023-02-21 22:07:40 +00:00
JustAnotherArchivist
f109f3fd46 Fix forgotten warning name change (cf. 7327a013) 2023-02-21 21:59:06 +00:00
JustAnotherArchivist
7330e0a9a0 Rename private logger variable 2023-02-21 21:26:00 +00:00
JustAnotherArchivist
4e6956e564 Remove dead code 2023-02-21 21:25:01 +00:00
JustAnotherArchivist
4e70306f99 Deprecate Entity type
There is no meaningful distinction from Items, and it complicates the integration of scrapers for user searches
2023-02-21 21:24:00 +00:00
JustAnotherArchivist
7327a01397 Refactor module-level deprecation code 2023-02-21 21:23:12 +00:00
JustAnotherArchivist
880a0a7f55 Handle TweetUnavailable results
Fixes #433
2023-02-21 20:16:23 +00:00
JustAnotherArchivist
57b126c656 Add support for scraping Twitter Communities
Closes #614
2023-02-21 20:15:57 +00:00
JustAnotherArchivist
82f64a6472 Remove dead code 2023-02-21 06:22:13 +00:00
JustAnotherArchivist
6a6b02cb28 Handle tombstones
Closes #392
Fixes #603
2023-02-21 04:23:47 +00:00
JustAnotherArchivist
3d6cd63a00 Fix more logger typos 2023-02-21 04:23:47 +00:00
JustAnotherArchivist
9a2f1524c2 Remove dead code 2023-02-21 04:23:47 +00:00
JustAnotherArchivist
b5694e01a2 Fix logger typo 2023-02-21 04:23:47 +00:00
JustAnotherArchivist
280b972f22 Fix extraction of tweets behind 'offensive' replies button 2023-02-21 04:23:47 +00:00
JustAnotherArchivist
6ba478657b Merge pull request #733 from mrunderline/fix/telegram_channel_members_count
fix: telegram channel members count
2023-02-20 19:16:03 +00:00
Ali Madihi
71fb33af70 fix: telegram channel members count 2023-02-20 22:14:34 +03:30
JustAnotherArchivist
c65e36a094 Bump GraphQL endpoints 2023-02-19 06:21:40 +00:00
JustAnotherArchivist
206907612d Fix double dump on exceptions with --dump-locals 2023-02-19 05:12:47 +00:00
JustAnotherArchivist
fe5d90b748 Fix tweets behind 'Show more replies' button getting missed
Fixes #572
2023-02-19 03:29:39 +00:00
JustAnotherArchivist
f1cb96b685 Merge pull request #724 from quentinwolf/patch-1
Twitter: change fullUrl to use 'orig' instead of 'large'
2023-02-19 02:55:27 +00:00
JustAnotherArchivist
8709282ba0 Add deprecated properties to JSON
Cf. #611
2023-02-19 02:51:47 +00:00
quentinwolf
0933a30e37 change fullUrl to use 'orig' instead of 'large'
Changing fullUrl from '&name=large' to '&name=orig', since large is capped at half the resolution of orig, which may not be ideal for scraping/archiving.

Large images are  2048px x 1365px
Original images are up to 4096px × 2730px

Alternatively, one could add largeUrl as an alternative to download the Large image and utilize fullUrl as above to download the original image for those that do wish to save either version, but I feel there is no reason for saving the middle-resolution image.
2023-02-13 16:45:44 -07:00
JustAnotherArchivist
d60ce38b6a Make (most) consistency errors in unified cards non-fatal
Fixes #703
2023-02-10 02:39:06 +00:00
JustAnotherArchivist
23ebdd2a3c Fix YAML syntax 2023-02-02 21:03:52 +00:00
JustAnotherArchivist
35c0c32c38 Refine bug report template 2023-02-02 21:02:16 +00:00
JustAnotherArchivist
b515a66b93 Fix crash in recursive tweet scraping
Introduced by 3e297c9a

Fixes #684
2023-01-19 16:18:15 +00:00
JustAnotherArchivist
36e85c54c1 Log response headers for debugging 2023-01-16 03:48:21 +00:00
JustAnotherArchivist
49270f6d3a Fix debug messages for redirects to report the correct status code and redirect location 2023-01-16 03:47:46 +00:00
JustAnotherArchivist
d0fb9ab8a9 Log TLS connection details for debugging 2023-01-16 02:39:05 +00:00
JustAnotherArchivist
5d3f27bc2b Fix title-less BroadcastCard crash 2023-01-15 16:36:04 +00:00
JustAnotherArchivist
b7cb270b6e Fix crash on empty user objects 2023-01-15 12:31:28 +00:00
JustAnotherArchivist
8ad26fc7d1 Switch from setup.py to pyproject.toml 2023-01-13 18:52:03 +00:00
JustAnotherArchivist
1fb5c39168 Add Python 3.11 classifier 2023-01-13 10:12:39 +00:00
JustAnotherArchivist
d81d247a87 Port Reddit scraper to new Pushshift API
Fixes #619
2023-01-13 10:07:58 +00:00
JustAnotherArchivist
564a5eca77 Fix crash on unavailable users in cards 2023-01-13 09:12:16 +00:00
JustAnotherArchivist
bf0e720b5a Fix crash on empty tweet entries in timelines
Fixes #620
2023-01-13 09:01:15 +00:00
JustAnotherArchivist
27374285a2 Fix crash on missing source label data
This data had been announced in mid-November to disappear but was still always returned by the API until very recently.
2023-01-13 08:32:02 +00:00
JustAnotherArchivist
238bdcd560 Reduce warnings about duplicate users on cards 2023-01-13 08:28:52 +00:00
JustAnotherArchivist
e846a6a4cd Fix KeyError in card user handling 2023-01-13 08:06:57 +00:00
JustAnotherArchivist
cbeb65d5c9 Fix KeyError crash on some tweets with AmplifyCards
Fixes #601
2023-01-13 07:57:31 +00:00
JustAnotherArchivist
3e19f8f84b Add support for image_collection_website unified cards 2023-01-13 07:36:53 +00:00
JustAnotherArchivist
28f5a45825 Fix empty page counter not getting reset on results 2023-01-13 06:59:51 +00:00
JustAnotherArchivist
2196bdf3e8 Extract vibe 2023-01-13 04:09:00 +00:00
JustAnotherArchivist
faf09b2f5e Extract tweet view counts
Closes #629
2023-01-13 04:00:50 +00:00
JustAnotherArchivist
3e297c9a42 Update GraphQL API parameters 2023-01-13 04:00:31 +00:00
JustAnotherArchivist
a0414d92cf Extract alt text for media on Twitter
Closes #588
2023-01-13 03:13:10 +00:00
JustAnotherArchivist
ff5e2d61ee Update search API parameters 2023-01-13 03:01:48 +00:00
JustAnotherArchivist
129ad3fc34 Add --max-empty-pages option to stop long (potentially infinite) empty pagination
Fixes #636
2023-01-13 02:35:48 +00:00
JustAnotherArchivist
7de8d734e9 Override TLS ciphers to get past Twitter's new fingerprinting
Fixes #647
2023-01-13 02:25:39 +00:00
JustAnotherArchivist
ceb06664f0 Clarify descriptions of issue templates 2023-01-11 22:52:52 +00:00
JustAnotherArchivist
996cf882cc Expose status code for non-200 Twitter responses 2023-01-11 20:01:05 +00:00
JustAnotherArchivist
e449d5cdbe Expose individual error messages when all request retries fail 2023-01-11 20:01:05 +00:00
JustAnotherArchivist
cbdaee6864 Merge pull request #343 from TheTechRobo/master
Add issue templates for snscrape
2022-12-19 23:25:17 +00:00
JustAnotherArchivist
a3bee057b1 Merge pull request #615 from engkimo/fix-return-twitter-place-ids
Add returning Twitter Place IDs
2022-12-19 22:57:40 +00:00
JustAnotherArchivist
6f9a0e6534 Merge pull request #590 from caseyho/UnifiedCardApp_no_category
Handle tweets that contain card info with no category
2022-12-19 22:55:36 +00:00
engkimo
4ff4af13cf Add returning Twitter Place IDs 2022-12-06 11:23:01 +09:00
JustAnotherArchivist
e09aea70e7 Fix Twitter username length limit
Although 15 characters is the official, current limit, there are accounts with longer usernames. 20 is the longest observed example, but it's unclear what the true limit is.
2022-12-03 06:36:52 +00:00
Tristan Lee
cbdfeed812 fixed edge case where members information wasnt included 2022-11-30 12:59:38 -06:00
Casey Ho
aa325fa1a5 Handle UnifiedCardApp with no category 2022-11-14 17:38:03 -08:00
JustAnotherArchivist
46a603053c Handle users with extensions but no label
Fixes #559
2022-10-16 21:13:46 +00:00
JustAnotherArchivist
59abeaf04c Make newsletter card images optional
Fixes #546
2022-09-04 15:04:20 +00:00
JustAnotherArchivist
e13033fea0 Fix AttributeError on certain videos included from other platforms 2022-08-24 15:53:21 +00:00
JustAnotherArchivist
9294c26ffa Make PeriscopeBroadcastCard.thumbnailUrl optional to handle tweets without a thumbnail
Fixes #507
2022-08-21 01:58:41 +00:00
JustAnotherArchivist
d6bce5b1d6 Merge pull request #518 from hgrsd/fix/vkontakte-photo-scrape
fix(vkontakte): update photo detection
2022-08-21 01:49:59 +00:00
JustAnotherArchivist
2c7a85a620 Add warning on unknown page_info types 2022-08-21 01:40:49 +00:00
JustAnotherArchivist
ff18f6f771 Fix video extraction on Weibo
Fixes #509
2022-08-21 01:40:31 +00:00
JustAnotherArchivist
da3d870e10 Drop app icons when Twitter didn't actually include them in the response
Fixes #470
2022-08-13 21:17:55 +00:00
hgrsd
279d1cf4a1 fix(vkontakte): update photo detection 2022-07-16 18:27:02 +01:00
TheTechRobo
afb6bfc429 add feature_request and question templates 2022-01-04 13:43:26 -05:00
TheTechRobo
ec5626097a Create bug_report.yml 2022-01-04 12:59:28 -05:00
17 changed files with 939 additions and 386 deletions

97
.github/ISSUE_TEMPLATE/bug_report.yml vendored Normal file
View File

@@ -0,0 +1,97 @@
name: Bug report
description: Are you experiencing a problem? Create a report to help us improve!
labels: 'bug'
body:
- type: markdown
attributes:
value: |
## Self Check
- Try searching existing GitHub Issues (open or closed) for similar issues.
- type: textarea
validations:
required: true
attributes:
label: Describe the bug
description: A clear description of what the bug is.
placeholder: e.g. I see an AssertionError when trying to scrape a Twitter user!
- type: textarea
validations:
required: true
attributes:
label: How to reproduce
description: |
How to reproduce the problem.
This should be a minimal reproducible example, i.e. the shortest possible code or the smallest number of steps that still causes the error.
placeholder: e.g. I can reproduce this issue by scraping the textfiles user with the twitter-user scraper.
- type: textarea
validations:
required: true
attributes:
label: Expected behaviour
description: A brief description of what should happen.
- type: textarea
attributes:
label: Screenshots and recordings
description: |
If applicable, add screenshots or videos to help explain your problem. (Videos should be as short as possible! Avoid watermarks too.)
- type: input
validations:
required: true
attributes:
label: Operating system
description: Include the version too, please!
placeholder: e.g. Windows 10, Ubuntu 20.04, macOS 10.15...
- type: input
validations:
required: true
attributes:
label: |
Python version: output of `python3 --version`
- type: input
validations:
required: true
attributes:
label: |
snscrape version: output of `snscrape --version`
- type: input
validations:
required: true
attributes:
label: Scraper
placeholder: e.g. twitter-user, reddit-search, TwitterSearchScraper, ...
- type: dropdown
validations:
required: true
attributes:
label: How are you using snscrape?
options: ['CLI (`snscrape ...` as a command, e.g. in a terminal)', 'Module (`import snscrape.modules.something` in Python code)']
- type: textarea
validations:
required: false
attributes:
label: Backtrace
description: What is the error snscrape gives you, if any?
- type: textarea
validations:
required: false
attributes:
label: Log output
description: |
Insert here the debug log of snscrape.
If you use the CLI, add the global options `-vv` to the command, e.g. `snscrape -vv twitter-search ...`.
If you use the module, set the debug level in your Python code before any use of snscrape: `import logging; logging.basicConfig(level = logging.DEBUG)`.
If you already use `logging` in your own code, you may need to adjust the level there instead.
- type: textarea
validations:
required: false
attributes:
label: Dump of locals
description: |
Here attach the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path).
Please note that it may contain identifying info such as IP address, if the website returns that.
You can also optionally request to exchange the file in private.
Finally, if snscrape didn't crash, leave this field blank.
- type: textarea
attributes:
label: Additional context
description: Add any other context about the problem here.

View File

@@ -0,0 +1,27 @@
name: Feature Request
description: Want a feature? Ask; we don't bite!
labels: 'enhancement'
body:
- type: markdown
attributes:
value: |
## Self Check
- Try searching existing GitHub Issues (open or closed) for similar issues.
- type: textarea
validations:
required: true
attributes:
label: Describe the feature
description: A clear description of what the feature is.
- type: textarea
validations:
required: false
attributes:
label: Would this fix a problem you're experiencing? If so, specify.
- type: textarea
attributes:
label: Did you consider other alternatives?
description: If so, specify
- type: input
attributes:
label: Additional context

6
.github/ISSUE_TEMPLATE/question.md vendored Normal file
View File

@@ -0,0 +1,6 @@
---
name: Question
about: Ask away! (Do not use this for bugs or features.)
labels: 'question'
---

View File

@@ -8,7 +8,7 @@ The following services are currently supported:
* Mastodon: user profiles and toots (single or thread)
* Reddit: users, subreddits, and searches (via Pushshift)
* Telegram: channels
* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends
* VKontakte: user profiles
* Weibo (Sina Weibo): user profiles
@@ -59,7 +59,10 @@ To get the latest 100 tweets with the hashtag #archiveteam:
It is also possible to use snscrape as a library in Python, but this is currently undocumented.
## Issue reporting
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If you use the CLI, please run snscrape with `-vv` and include the log output in the issue. If you use snscrape as a module, please enable debug-level logging using `import logging; logging.basicConfig(level = logging.DEBUG)` (before using snscrape at all) and include the log output in the issue.
### Dump files
In some cases, debugging may require more information than is available in the log. The CLI has a `--dump-locals` option that enables dumping all local variables within snscrape based on important log messages (rather than, by default, only on crashes). Note that the dump files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
## License
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

37
pyproject.toml Normal file
View File

@@ -0,0 +1,37 @@
[build-system]
requires = ['setuptools>=61', 'setuptools_scm>=6.2']
build-backend = 'setuptools.build_meta'
[tool.setuptools]
packages = ['snscrape', 'snscrape.modules']
[tool.setuptools_scm]
[project]
name = 'snscrape'
description = 'A social networking service scraper'
readme = 'README.md'
authors = [{name = 'JustAnotherArchivist'}]
classifiers = [
'Development Status :: 4 - Beta',
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]
dependencies = [
'requests[socks]',
'lxml',
'beautifulsoup4',
'pytz; python_version < "3.9.0"',
'filelock',
]
requires-python = '~=3.8'
dynamic = ['version']
[project.urls]
repository = "https://github.com/JustAnotherArchivist/snscrape"
[project.scripts]
snscrape = 'snscrape._cli:main'

View File

@@ -1,42 +0,0 @@
import os.path
import setuptools
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp:
readme = fp.read()
setuptools.setup(
name = 'snscrape',
description = 'A social networking service scraper',
long_description = readme,
long_description_content_type = 'text/markdown',
author = 'JustAnotherArchivist',
url = 'https://github.com/JustAnotherArchivist/snscrape',
classifiers = [
'Development Status :: 4 - Beta',
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
],
packages = ['snscrape', 'snscrape.modules'],
setup_requires = ['setuptools_scm'],
use_scm_version = True,
install_requires = [
'requests[socks]',
'lxml',
'beautifulsoup4',
'pytz; python_version < "3.9.0"',
'filelock',
],
python_requires = '~=3.8',
extras_require = {
'test': ['coverage'],
},
entry_points = {
'console_scripts': [
'snscrape = snscrape._cli:main',
],
},
)

View File

@@ -6,6 +6,7 @@ import datetime
import importlib.metadata
import inspect
import logging
import os
import requests
# Imported in parse_args() after setting up the logger:
#import snscrape.base
@@ -23,7 +24,7 @@ logger = logging # Replaced below after setting the logger class
class Logger(logging.Logger):
def _log_with_stack(self, level, *args, **kwargs):
super().log(level, *args, **kwargs)
if dumpLocals:
if dumpLocals and not kwargs.get('extra', {}).get('_snscrapeSuppressDumpLocals', False):
stack = inspect.stack()
if len(stack) >= 3:
name = _dump_stack_and_locals(stack[2:][::-1])
@@ -118,7 +119,7 @@ def _dump_locals_on_exception():
trace = inspect.trace()
if len(trace) >= 2:
name = _dump_stack_and_locals(trace[1:], exc = e)
logger.fatal(f'Dumped stack and locals to {name}')
logger.fatal(f'Dumped stack and locals to {name}', extra = {'_snscrapeSuppressDumpLocals': True})
raise
@@ -307,32 +308,36 @@ def main():
i = 0
with _dump_locals_on_exception():
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(entity.json())
try:
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(entity.json())
else:
print(entity)
if args.maxResults == 0:
logger.info('Exiting after 0 results')
return
for i, item in enumerate(scraper.get_items(), start = 1):
if args.since is not None and item.date < args.since:
logger.info(f'Exiting due to reaching older results than {args.since}')
break
if args.jsonl:
print(item.json())
elif args.format is not None:
print(args.format.format(item))
else:
print(item)
if args.progress and i % 100 == 0:
print(f'Scraping, {i} results so far', file = sys.stderr)
if args.maxResults and i >= args.maxResults:
logger.info(f'Exiting after {i} results')
if args.progress:
print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
break
else:
print(entity)
if args.maxResults == 0:
logger.info('Exiting after 0 results')
return
for i, item in enumerate(scraper.get_items(), start = 1):
if args.since is not None and item.date < args.since:
logger.info(f'Exiting due to reaching older results than {args.since}')
break
if args.jsonl:
print(item.json())
elif args.format is not None:
print(args.format.format(item))
else:
print(item)
if args.progress and i % 100 == 0:
print(f'Scraping, {i} results so far', file = sys.stderr)
if args.maxResults and i >= args.maxResults:
logger.info(f'Exiting after {i} results')
logger.info(f'Done, found {i} results')
if args.progress:
print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
break
else:
logger.info(f'Done, found {i} results')
if args.progress:
print(f'Finished, {i} results', file = sys.stderr)
print(f'Finished, {i} results', file = sys.stderr)
except BrokenPipeError:
os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
sys.exit(1)

View File

@@ -1,3 +1,6 @@
__all__ = ['DeprecatedFeatureWarning', 'IntWithGranularity', 'Item', 'Scraper', 'ScraperException']
import abc
import copy
import dataclasses
@@ -6,11 +9,28 @@ import functools
import json
import logging
import requests
import requests.adapters
import urllib3.connection
import time
import warnings
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
def _module_deprecation_helper(all, **names):
def __getattr__(name):
if name in names:
warnings.warn(f'{name} is deprecated, use {names[name].__name__} instead', DeprecatedFeatureWarning, stacklevel = 2)
return names[name]
raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
def __dir__():
return sorted(all + list(names.keys()))
return __getattr__, __dir__
class DeprecatedFeatureWarning(FutureWarning):
    '''Warning category for uses of deprecated snscrape features.'''
class _DeprecatedProperty:
@@ -22,7 +42,7 @@ class _DeprecatedProperty:
def __get__(self, obj, objType):
if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
return self
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2)
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedFeatureWarning, stacklevel = 2)
return self.repl(obj)
@@ -43,9 +63,9 @@ def _json_dataclass_to_dict(obj):
if field.name.startswith('_'):
continue
out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
# Add in (non-deprecated) properties
# Add properties
for k in dir(obj):
if isinstance(getattr(type(obj), k, None), property):
if isinstance(getattr(type(obj), k, None), (property, _DeprecatedProperty)):
assert k != '_type'
if k.startswith('_'):
continue
@@ -68,7 +88,9 @@ class _JSONDataclass:
def json(self):
'''Convert the object to a JSON string'''
out = _json_dataclass_to_dict(self)
with warnings.catch_warnings():
warnings.filterwarnings(action = 'ignore', category = DeprecatedFeatureWarning)
out = _json_dataclass_to_dict(self)
for key, value in list(out.items()): # Modifying the dict below, so make a copy first
if isinstance(value, IntWithGranularity):
out[key] = int(value)
@@ -79,7 +101,7 @@ class _JSONDataclass:
@dataclasses.dataclass
class Item(_JSONDataclass):
'''An abstract base class for an item returned by the scraper's get_items generator.
'''An abstract base class for an item returned by the scraper.
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).
'''
@@ -89,18 +111,6 @@ class Item(_JSONDataclass):
pass
@dataclasses.dataclass
class Entity(_JSONDataclass):
'''An abstract base class for an entity returned by the scraper's entity property.
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.
'''
@abc.abstractmethod
def __str__(self):
pass
class IntWithGranularity(int):
'''A number with an associated granularity
@@ -116,18 +126,31 @@ class IntWithGranularity(int):
return (IntWithGranularity, (int(self), self.granularity))
class URLItem(Item):
'''A generic item which only holds a URL string.'''
class _HTTPSAdapter(requests.adapters.HTTPAdapter):
    '''HTTP adapter whose HTTPS connection pools are switched to `_HTTPSConnection`.'''

    def init_poolmanager(self, *args, **kwargs):
        '''Initialise the pool manager as usual, then try to swap in `_HTTPSConnection` for HTTPS pools.'''
        super().init_poolmanager(*args, **kwargs)
        #FIXME: Uses private urllib3.PoolManager attribute pool_classes_by_scheme.
        try:
            self.poolmanager.pool_classes_by_scheme['https'].ConnectionCls = _HTTPSConnection
        except (AttributeError, KeyError) as e:
            # Best effort only: if the private attribute is missing, requests still work, just without the extra logging.
            _logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}')
def __init__(self, url):
self._url = url
@property
def url(self):
return self._url
def __str__(self):
return self._url
class _HTTPSConnection(urllib3.connection.HTTPSConnection):
    '''HTTPS connection that debug-logs the peer address and the connection cipher after connecting.'''

    def connect(self, *args, **kwargs):
        '''Connect via the parent class, log connection details at debug level, and return the parent's result.'''
        conn = super().connect(*args, **kwargs)
        #FIXME: Uses undocumented attribute self.sock and beyond.
        try:
            _logger.debug(f'Connected to: {self.sock.getpeername()}')
        except AttributeError:
            # self.sock might be a urllib3.util.ssltransport.SSLTransport, which lacks getpeername.
            pass
        try:
            _logger.debug(f'Connection cipher: {self.sock.cipher()}')
        except AttributeError:
            # Shouldn't be possible, but better safe than sorry.
            pass
        return conn
class ScraperException(Exception):
@@ -143,6 +166,7 @@ class Scraper:
self._retries = retries
self._proxies = proxies
self._session = requests.Session()
self._session.mount('https://', _HTTPSAdapter())
@abc.abstractmethod
def get_items(self):
@@ -164,16 +188,17 @@ class Scraper:
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
proxies = proxies or self._proxies or {}
errors = []
for attempt in range(self._retries + 1):
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
logger.info(f'Retrieving {req.url}')
logger.debug(f'... with headers: {headers!r}')
_logger.info(f'Retrieving {req.url}')
_logger.debug(f'... with headers: {headers!r}')
if data:
logger.debug(f'... with data: {data!r}')
_logger.debug(f'... with data: {data!r}')
if environmentSettings:
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
_logger.debug(f'... with environmentSettings: {environmentSettings!r}')
try:
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
except requests.exceptions.RequestException as exc:
@@ -183,21 +208,25 @@ class Scraper:
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
_logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
errors.append(repr(exc))
else:
redirected = f' (redirected to {r.url})' if r.history else ''
logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
_logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
_logger.debug(f'... with response headers: {r.headers!r}')
if r.history:
for i, redirect in enumerate(r.history):
logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})')
_logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})')
_logger.debug(f'... ... with response headers: {redirect.headers!r}')
if responseOkCallback is not None:
success, msg = responseOkCallback(r)
errors.append(msg)
else:
success, msg = (True, None)
msg = f': {msg}' if msg else ''
if success:
logger.debug(f'{req.url} retrieved successfully{msg}')
_logger.debug(f'{req.url} retrieved successfully{msg}')
return r
else:
if attempt < self._retries:
@@ -206,14 +235,15 @@ class Scraper:
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
_logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
if attempt < self._retries:
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
logger.info(f'Waiting {sleepTime:.0f} seconds')
_logger.info(f'Waiting {sleepTime:.0f} seconds')
time.sleep(sleepTime)
else:
msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
logger.fatal(msg)
_logger.fatal(msg)
_logger.fatal(f'Errors: {", ".join(errors)}')
raise ScraperException(msg)
raise RuntimeError('Reached unreachable code')
@@ -244,3 +274,6 @@ def nonempty_string(name):
raise ValueError('must not be an empty string')
f.__name__ = name
return f
__getattr__, __dir__ = _module_deprecation_helper(__all__, Entity = Item)

View File

@@ -30,7 +30,7 @@ class FacebookPost(snscrape.base.Item):
@dataclasses.dataclass
class User(snscrape.base.Entity):
class User(snscrape.base.Item):
username: str
pageId: int
name: str

View File

@@ -32,7 +32,7 @@ class InstagramPost(snscrape.base.Item):
@dataclasses.dataclass
class User(snscrape.base.Entity):
class User(snscrape.base.Item):
username: str
name: typing.Optional[str]
followers: snscrape.base.IntWithGranularity

View File

@@ -67,7 +67,7 @@ class PollOption:
@dataclasses.dataclass
class User(snscrape.base.Entity):
class User(snscrape.base.Item):
account: str # @username@domain.invalid
displayName: typing.Optional[str] = None
displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None

View File

@@ -133,6 +133,21 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):
return cls(**kwargs)
def _iter_api(self, url, params = None):
    '''Iterate through the Pushshift API using the 'until' parameter and yield the items.

    `params` is modified in place while paginating (its 'until' key is updated after every page),
    so pass a copy if the dict is used elsewhere.
    '''

    lowestIdSeen = None
    if params is None:
        params = {}
    while True:
        obj = self._get_api(url, params = params)
        # Stop on an empty page, or on a page that contains no ID lower than the lowest one seen so far.
        if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination
            break
        for d in obj['data']:
            # Only yield entries with an ID below the lowest seen so far; this skips entries repeated across pages.
            if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
                yield self._api_obj_to_item(d)
                lowestIdSeen = d['id']
        # The +1 makes the next page overlap at the last timestamp; repeated entries are filtered by the ID check above.
        params['until'] = obj['data'][-1]['created_utc'] + 1
class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
@@ -148,35 +163,20 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
if not self._submissions and not self._comments:
raise ValueError('At least one of submissions and comments must be True')
def _iter_api(self, url, params = None):
'''Iterate through the Pushshift API using the 'before' parameter and yield the items.'''
lowestIdSeen = None
if params is None:
params = {}
if self._before is not None:
params['before'] = self._before
if self._after is not None:
params['after'] = self._after
params['sort'] = 'desc'
while True:
obj = self._get_api(url, params = params)
if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination
break
for d in obj['data']:
if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
yield self._api_obj_to_item(d)
lowestIdSeen = d['id']
params['before'] = obj["data"][-1]["created_utc"] + 1
def _iter_api_submissions_and_comments(self, params: dict):
# Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
params['size'] = '1000'
params['limit'] = '1000'
if self._before is not None:
params['until'] = self._before
if self._after is not None:
params['since'] = self._after
if self._submissions:
submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
else:
submissionsIter = iter(())
if self._comments:
commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy())
commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment', params.copy())
else:
commentsIter = iter(())
@@ -260,21 +260,15 @@ class RedditSubmissionScraper(_RedditPushshiftScraper):
self._submissionId = submissionId
def get_items(self):
obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')
obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission?ids={self._submissionId}')
if not obj['data']:
return
if len(obj['data']) != 1:
raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1')
yield self._api_obj_to_item(obj['data'][0])
obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')
if not obj['data']:
return
commentIds = obj['data']
for i in range(0, len(commentIds), 500):
ids = commentIds[i : i + 500]
obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}')
yield from map(self._api_obj_to_item, obj['data'])
# Upstream bug: link_id must be provided in decimal https://old.reddit.com/r/pushshift/comments/zkggt0/update_on_colo_switchover_bug_fixes_reindexing/
yield from self._iter_api('https://api.pushshift.io/reddit/search/comment', {'link_id': int(self._submissionId, 36), 'limit': 1000})
@classmethod
def _cli_setup_parser(cls, subparser):

View File

@@ -24,7 +24,7 @@ class LinkPreview:
@dataclasses.dataclass
class Channel(snscrape.base.Entity):
class Channel(snscrape.base.Item):
username: str
title: typing.Optional[str] = None
verified: typing.Optional[bool] = None
@@ -269,13 +269,10 @@ class TelegramChannelScraper(snscrape.base.Scraper):
if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
if membersStr == 'no':
kwargs['members'] = 0
else:
kwargs['members'] = int(membersStr)
if (membersDiv := soup.find('div', class_ = 'tgme_page_extra')):
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
kwargs['members'] = 0 if membersStr == 'no' else int(membersStr)
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
if photoImg is not None:
kwargs['photo'] = photoImg.attrs['src']

File diff suppressed because it is too large Load Diff

View File

@@ -38,35 +38,11 @@ _datePattern = re.compile(r'^(?P<date>today'
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
name: str
verified: bool
description: typing.Optional[str] = None
websites: typing.Optional[typing.List[str]] = None
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
following: typing.Optional[snscrape.base.IntWithGranularity] = None
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
def __str__(self):
return f'https://vk.com/{self.username}'
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
url: str
date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
content: str
user: User
outlinks: typing.Optional[typing.List[str]] = None
photos: typing.Optional[typing.List['Photo']] = None
video: typing.Optional['Video'] = None
@@ -98,6 +74,29 @@ class Video:
thumbUrl: str
@dataclasses.dataclass
class User(snscrape.base.Item):
    '''Profile data of a VKontakte user.

    Converting an instance to `str` gives `https://vk.com/<username>`.
    '''

    # Required fields; `username` is the path component used to build the profile URL in __str__.
    username: str
    name: str
    verified: bool
    # Optional profile details; each defaults to None.
    description: typing.Optional[str] = None
    websites: typing.Optional[typing.List[str]] = None
    # Optional counters, each stored as an IntWithGranularity and defaulting to None.
    followers: typing.Optional[snscrape.base.IntWithGranularity] = None
    posts: typing.Optional[snscrape.base.IntWithGranularity] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    tags: typing.Optional[snscrape.base.IntWithGranularity] = None
    following: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated accessors: each getter returns the `.granularity` attribute of the matching counter above.
    # These assignments carry no annotation, so they are plain class attributes rather than dataclass fields.
    # NOTE(review): the third argument looks like the replacement name that _DeprecatedProperty reports to users — confirm against its definition.
    followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
    postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
    followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')

    def __str__(self):
        # The string form of a user is the URL of its profile page.
        return f'https://vk.com/{self.username}'
class VKontakteUserScraper(snscrape.base.Scraper):
name = 'vkontakte-user'
@@ -118,6 +117,9 @@ class VKontakteUserScraper(snscrape.base.Scraper):
return urllib.parse.unquote(a['href'][13 : end])
return None
def is_photo(self, a):
    '''Tell whether the tag `a` carries an aria-label attribute whose value begins with 'photo'.'''
    attrs = a.attrs
    # A tag without the attribute is never a photo.
    if 'aria-label' not in attrs:
        return False
    return attrs['aria-label'].startswith('photo')
def _date_span_to_date(self, dateSpan):
if not dateSpan:
return None
@@ -173,7 +175,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
photos = []
for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
if not self.is_photo(a) and 'data-video' not in a.attrs:
_logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
continue
if 'data-video' in a.attrs:
@@ -213,24 +215,14 @@ class VKontakteUserScraper(snscrape.base.Scraper):
photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
photos.append(Photo(variants = photoVariants, url = photoUrl))
quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
username = authorLink['href'].split('/')[-1]
name = authorLink.text
if authorHeading.find('div', class_ = 'page_verified') is not None:
verified = True
else:
verified = False
user = User(username = username, name = name, verified = verified)
return VKontaktePost(
url = url,
date = self._date_span_to_date(dateSpan),
content = textDiv.text if textDiv else None,
user = user,
outlinks = outlinks or None,
photos = photos or None,
video = video or None,
quotedPost = quotedPost,
url = url,
date = self._date_span_to_date(dateSpan),
content = textDiv.text if textDiv else None,
outlinks = outlinks or None,
photos = photos or None,
video = video or None,
quotedPost = quotedPost,
)
def _soup_to_items(self, soup):
@@ -387,13 +379,6 @@ class VKontakteUserScraper(snscrape.base.Scraper):
if (followersDiv := soup.find('div', id = 'public_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
# On community groups, this is where followers are listed
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
# On public groups, this is where followers are listed
elif (followersDiv := soup.find('div', id = 'group_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
return User(**kwargs)

View File

@@ -34,7 +34,7 @@ class Post(snscrape.base.Item):
@dataclasses.dataclass
class User(snscrape.base.Entity):
class User(snscrape.base.Item):
screenname: str
uid: int
verified: bool
@@ -81,6 +81,8 @@ class WeiboUserScraper(snscrape.base.Scraper):
return True, None
def _mblog_to_item(self, mblog):
if mblog.get('page_info', {}).get('type') not in (None, 'video', 'webpage'):
_logger.warning(f'Skipping unknown page info {mblog["page_info"]["type"]!r} on status {mblog["id"]}')
return Post(
url = f'https://m.weibo.cn/status/{mblog["bid"]}',
id = mblog['id'],
@@ -92,7 +94,7 @@ class WeiboUserScraper(snscrape.base.Scraper):
likesCount = mblog.get('attitudes_count'),
picturesCount = mblog.get('pic_num'),
pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None,
video = mblog['page_info']['media_info']['mp4_720p_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None,
video = urls.get('mp4_720p_mp4') or urls.get('mp4_hd_mp4') or urls['mp4_ld_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' and (urls := mblog['page_info']['urls']) else None,
link = mblog['page_info']['page_url'] if 'page_info' in mblog and mblog['page_info']['type'] == 'webpage' else None,
repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None,
)

16
snscrape/utils.py Normal file
View File

@@ -0,0 +1,16 @@
def dict_map(input, keyMap):
    '''Build a new dict holding the values of `input` under renamed keys.

    `keyMap` is a {'input_key': 'output_key'} mapping. Entries of `keyMap` whose input key is missing from `input` are skipped; `input` itself is left untouched.
    '''
    out = {}
    for sourceKey, targetKey in keyMap.items():
        if sourceKey in input:
            out[targetKey] = input[sourceKey]
    return out
def snake_to_camel(**kwargs):
    '''Return a new dict of the keyword arguments with each snake_case name rewritten as camelCase.

    The part before the first underscore is kept as is; every following part gets its first character uppercased (the rest of the part is left unchanged) and the underscores are dropped.
    '''
    def _convert(name):
        head, *tail = name.split('_')
        return head + ''.join(part[:1].upper() + part[1:] for part in tail)

    return {_convert(name): value for name, value in kwargs.items()}