mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 10:38:28 +03:00
Compare commits
336 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40b8d9f267 | ||
|
|
fdc40f7411 | ||
|
|
82351800d6 | ||
|
|
73f10a4f24 | ||
|
|
cb429909d0 | ||
|
|
d72b51953f | ||
|
|
056cd6215c | ||
|
|
d5b406bc1b | ||
|
|
56e4232083 | ||
|
|
50899c01f3 | ||
|
|
bcad6923c2 | ||
|
|
0d361685ff | ||
|
|
530f4fa122 | ||
|
|
dc6bc9bf9d | ||
|
|
01cf6a09b3 | ||
|
|
ef7c4fad3e | ||
|
|
65723f10ff | ||
|
|
07a5f6fd7d | ||
|
|
0822a9c354 | ||
|
|
faeffe2603 | ||
|
|
e3bdc02a7c | ||
|
|
e2d922301e | ||
|
|
b13e62eb5d | ||
|
|
f38513503d | ||
|
|
0a4bd39ca6 | ||
|
|
c18ca0f047 | ||
|
|
5648e957d0 | ||
|
|
21f7b620ec | ||
|
|
9b3faec980 | ||
|
|
97d38e5cde | ||
|
|
b276c3cc27 | ||
|
|
1e4e0c278d | ||
|
|
babcddda19 | ||
|
|
ed3ea944d1 | ||
|
|
e7a6d38a5f | ||
|
|
6c50eee31b | ||
|
|
5103a33afa | ||
|
|
247bd82d79 | ||
|
|
5fc67f2bcf | ||
|
|
65e7d8bd24 | ||
|
|
3870282a42 | ||
|
|
7c0fcdec43 | ||
|
|
9af1f19034 | ||
|
|
5fc3c0e290 | ||
|
|
f978954bb3 | ||
|
|
2ce014ade4 | ||
|
|
5d156c6a15 | ||
|
|
4e59638e7c | ||
|
|
a7eb54d226 | ||
|
|
d32c9add8a | ||
|
|
fb8d73ac95 | ||
|
|
ed829163a0 | ||
|
|
694657ef80 | ||
|
|
b8efce2a12 | ||
|
|
1ab0f4fccb | ||
|
|
3a92b5bf0d | ||
|
|
2480b173f4 | ||
|
|
de4ebed81f | ||
|
|
72b26f2373 | ||
|
|
77bbb9f61f | ||
|
|
57a624c618 | ||
|
|
b1cfd51121 | ||
|
|
ace2c16f54 | ||
|
|
2f9c0457df | ||
|
|
878f2a3c7a | ||
|
|
25ee014e29 | ||
|
|
a192dc6236 | ||
|
|
a7242f340b | ||
|
|
359cc25cdf | ||
|
|
01799a7391 | ||
|
|
b0753c34ed | ||
|
|
7f78fa0bc0 | ||
|
|
8702a9c7e2 | ||
|
|
8ac1fd3ea8 | ||
|
|
9235890f9a | ||
|
|
7d939c110c | ||
|
|
8e95e9a9a7 | ||
|
|
aa7d7d3dc3 | ||
|
|
560c78c5cf | ||
|
|
107c3c71c2 | ||
|
|
7f88678253 | ||
|
|
52e4f9fb69 | ||
|
|
eebdfc1c55 | ||
|
|
e6076353c8 | ||
|
|
a32d79fab2 | ||
|
|
65391297f6 | ||
|
|
deb2659dd6 | ||
|
|
93e62744d7 | ||
|
|
3f3632d341 | ||
|
|
5070953feb | ||
|
|
853848ed5d | ||
|
|
0b4abdc43f | ||
|
|
267b7d0e32 | ||
|
|
acb7f10a4f | ||
|
|
ca00b480b1 | ||
|
|
f189ab4241 | ||
|
|
c6e1e33a23 | ||
|
|
a37ea528d3 | ||
|
|
eee06d8593 | ||
|
|
4dd3ee6e47 | ||
|
|
0336ce13ed | ||
|
|
193d4f80d6 | ||
|
|
e7d35ec1eb | ||
|
|
8540045658 | ||
|
|
1f1c1bd8af | ||
|
|
7fdc8bcb53 | ||
|
|
4b3c6aefe7 | ||
|
|
525cd71225 | ||
|
|
72abff9e5c | ||
|
|
bcaa477b3d | ||
|
|
66d4c99f82 | ||
|
|
0ac50f1383 | ||
|
|
c2257ad16e | ||
|
|
58f654405f | ||
|
|
35fb61a327 | ||
|
|
a6b6f3faaa | ||
|
|
5e829e2541 | ||
|
|
d4567da23c | ||
|
|
e5e0da25a0 | ||
|
|
821326bcfb | ||
|
|
4bf9ef239c | ||
|
|
e382891642 | ||
|
|
e5f4389464 | ||
|
|
d91f971f51 | ||
|
|
67e8295293 | ||
|
|
5fc2562642 | ||
|
|
2825bd0a73 | ||
|
|
9831f2a4a0 | ||
|
|
a11eef6b06 | ||
|
|
3fb731ade1 | ||
|
|
c76f1637ce | ||
|
|
ed117e8891 | ||
|
|
f9a3fafb3f | ||
|
|
660b8c7a0a | ||
|
|
0c22608dc7 | ||
|
|
2bb706feda | ||
|
|
5e6bc4ec50 | ||
|
|
57d0aaafc1 | ||
|
|
157e4d4265 | ||
|
|
54588e9c42 | ||
|
|
9e7274f3d7 | ||
|
|
ac4e335bdb | ||
|
|
1d255de48d | ||
|
|
9c1dcd37f9 | ||
|
|
f8dac183d0 | ||
|
|
45d1fa27de | ||
|
|
98b798b0e5 | ||
|
|
f18b64e7da | ||
|
|
460be9d581 | ||
|
|
97c8caea48 | ||
|
|
a34f93076a | ||
|
|
8f1c470061 | ||
|
|
dbf2a2f689 | ||
|
|
39a34a57ac | ||
|
|
f44b39705a | ||
|
|
f64ce217b7 | ||
|
|
cdf87f4b8f | ||
|
|
47fbc2a84d | ||
|
|
5cd3b7d7cc | ||
|
|
0121fa51c2 | ||
|
|
892941b609 | ||
|
|
e3022628b6 | ||
|
|
fdc33d0dba | ||
|
|
6d6411cc24 | ||
|
|
61a1ecffc5 | ||
|
|
d2dce37fa0 | ||
|
|
d65f0434da | ||
|
|
7499384110 | ||
|
|
7a0f68b7ec | ||
|
|
1a219fd2b6 | ||
|
|
6fb98dae12 | ||
|
|
8c2c0fa47a | ||
|
|
58c8365c33 | ||
|
|
2c11ec38fa | ||
|
|
fe5e23502d | ||
|
|
644cd1d2fb | ||
|
|
5ccfab6314 | ||
|
|
bf895ea5b1 | ||
|
|
e956e2562b | ||
|
|
defe874bf4 | ||
|
|
3f8935ee4d | ||
|
|
cd12500dbf | ||
|
|
5dc61d50ac | ||
|
|
11a82e110a | ||
|
|
16ebe8bf48 | ||
|
|
1bbe25647a | ||
|
|
e22b461563 | ||
|
|
c4a5715e18 | ||
|
|
5cb64faa72 | ||
|
|
0f78aa45fc | ||
|
|
179112a310 | ||
|
|
4ce9ed4eb3 | ||
|
|
11414cb68f | ||
|
|
bd53e729a0 | ||
|
|
ffd9289edc | ||
|
|
b1a7b9607f | ||
|
|
119e53d07c | ||
|
|
c3e2e12369 | ||
|
|
a70b361176 | ||
|
|
8b68f1a8af | ||
|
|
c72bf3174f | ||
|
|
472cef2382 | ||
|
|
b1d8475a03 | ||
|
|
3d3faf80bf | ||
|
|
bbb372284b | ||
|
|
8cf81e9bfc | ||
|
|
d90f06b389 | ||
|
|
c519832755 | ||
|
|
397a0b988e | ||
|
|
f1428fa0e0 | ||
|
|
7d2c546ee5 | ||
|
|
2332c30e26 | ||
|
|
b78bf3e642 | ||
|
|
1a09f9b9a3 | ||
|
|
5ae5ec7bcd | ||
|
|
c0ff6631aa | ||
|
|
ae60a4d0fd | ||
|
|
800cfd5be0 | ||
|
|
f296f9d21d | ||
|
|
8265ffc19e | ||
|
|
f8efe98608 | ||
|
|
2b5444f89e | ||
|
|
07d446fd19 | ||
|
|
a25426043b | ||
|
|
84692846b9 | ||
|
|
039b2c6719 | ||
|
|
70a3d9ba3a | ||
|
|
bd619bf4e9 | ||
|
|
072519f539 | ||
|
|
d9572ec450 | ||
|
|
ba250aabf2 | ||
|
|
0cc4f0c016 | ||
|
|
1a2e367a87 | ||
|
|
4f24843f89 | ||
|
|
bfb92a47b9 | ||
|
|
dc5d55004b | ||
|
|
d8e7f96d4d | ||
|
|
bb83d1d72f | ||
|
|
1480260e47 | ||
|
|
c8d688d39f | ||
|
|
9df4352089 | ||
|
|
dd25fd0526 | ||
|
|
c90fd54b6b | ||
|
|
9528df48cd | ||
|
|
924c35f883 | ||
|
|
588ec415ff | ||
|
|
bf229414ba | ||
|
|
afa819547d | ||
|
|
dbcdc159ef | ||
|
|
30f945897a | ||
|
|
eee5794ff9 | ||
|
|
966a6ebd8e | ||
|
|
4d3d0fe0d7 | ||
|
|
7b967ff82a | ||
|
|
90f9598ecc | ||
|
|
7b3c7deb28 | ||
|
|
040a11656c | ||
|
|
1459245258 | ||
|
|
dbe4c5ce55 | ||
|
|
80491ecc2c | ||
|
|
1a71b58101 | ||
|
|
0ce37a69d4 | ||
|
|
722bfd5f7c | ||
|
|
b6cc3180d9 | ||
|
|
613395d1c2 | ||
|
|
82a87b7b5a | ||
|
|
9568028bf9 | ||
|
|
6df351772e | ||
|
|
541173b0c8 | ||
|
|
b6772d3778 | ||
|
|
20ea117a2c | ||
|
|
ff54c350bc | ||
|
|
e6aae35304 | ||
|
|
b698a201f5 | ||
|
|
7fe72cf708 | ||
|
|
4651cde447 | ||
|
|
c99cc4b5d3 | ||
|
|
628074d6fc | ||
|
|
64b293bd9e | ||
|
|
180f4dfeb7 | ||
|
|
6d6e3fa16c | ||
|
|
5f7e6936c1 | ||
|
|
e2c05c9e0c | ||
|
|
14e11b28d2 | ||
|
|
1a07b3b7e8 | ||
|
|
4d8cc7bdb9 | ||
|
|
eec83f181e | ||
|
|
fae7432c64 | ||
|
|
757818474d | ||
|
|
e6c934c0b8 | ||
|
|
d2315feec1 | ||
|
|
765ceeeb10 | ||
|
|
731a2e8c8b | ||
|
|
7d1916292c | ||
|
|
0d509c4ba0 | ||
|
|
907a003a59 | ||
|
|
8ada279b57 | ||
|
|
900eae54a6 | ||
|
|
7989af27b5 | ||
|
|
e528ca3f26 | ||
|
|
32a427dac3 | ||
|
|
7001983556 | ||
|
|
64438afc92 | ||
|
|
9e6538556a | ||
|
|
9c8bbf051c | ||
|
|
c6a11298ac | ||
|
|
02cbf6ddf6 | ||
|
|
3817aa59d4 | ||
|
|
46a51008f8 | ||
|
|
f91979eb32 | ||
|
|
85fff319bc | ||
|
|
6b145526b7 | ||
|
|
abf31764b1 | ||
|
|
64693f74bb | ||
|
|
a7d08ed51c | ||
|
|
f48ca7726e | ||
|
|
78c295f7e0 | ||
|
|
a5aca1a14f | ||
|
|
96f7d871c1 | ||
|
|
b5dfd37949 | ||
|
|
b511397791 | ||
|
|
536fcb3303 | ||
|
|
f8d812f799 | ||
|
|
c2cebd9166 | ||
|
|
73bc99596f | ||
|
|
8458c12218 | ||
|
|
b59c7e8d8f | ||
|
|
3ceb849d98 | ||
|
|
f5ee1f7ac5 | ||
|
|
1984110f78 | ||
|
|
c5a5dcb92c | ||
|
|
cfb1c9a2aa | ||
|
|
d0d3c8b2a6 | ||
|
|
4d0350e541 | ||
|
|
d17aa15bcb | ||
|
|
d1ef280d6e |
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
__pycache__/
|
||||
/dist/
|
||||
/snscrape.egg-info/
|
||||
/.eggs/
|
||||
50
README.md
50
README.md
@@ -1,38 +1,66 @@
|
||||
# snscrape
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
|
||||
The following services are currently supported:
|
||||
* Facebook: user profiles
|
||||
* Google Plus: user profiles
|
||||
* Instagram: user profiles
|
||||
* Twitter: user profiles, hashtags, and searches
|
||||
|
||||
* Facebook: user profiles, groups, and communities (aka visitor posts)
|
||||
* Instagram: user profiles, hashtags, and locations
|
||||
* Mastodon: user profiles and toots (single or thread)
|
||||
* Reddit: users, subreddits, and searches (via Pushshift)
|
||||
* Telegram: channels
|
||||
* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
|
||||
* VKontakte: user profiles
|
||||
* Weibo (Sina Weibo): user profiles
|
||||
|
||||
## Requirements
|
||||
snscrape requires Python 3.6 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
|
||||
Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
|
||||
|
||||
## Installation
|
||||
pip3 install snscrape
|
||||
|
||||
If you want to use the development version:
|
||||
|
||||
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
|
||||
|
||||
## Usage
|
||||
To get all tweets by Jason Scott (@textfiles):
|
||||
### CLI
|
||||
The generic syntax of snscrape's CLI is:
|
||||
|
||||
snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
|
||||
|
||||
`snscrape --help` and `snscrape SCRAPER-NAME --help` provide details on the options and arguments. `snscrape --help` also lists all available scrapers.
|
||||
|
||||
The default output of the CLI is the URL of each result.
|
||||
|
||||
Some noteworthy global options are:
|
||||
|
||||
* `--jsonl` to get output as JSONL. This includes all information extracted by snscrape (e.g. message content, datetime, images; details vary by scraper).
|
||||
* `--max-results NUMBER` to only return the first `NUMBER` results.
|
||||
* `--with-entity` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with `--max-results 0` to only fetch the entity info.)
|
||||
|
||||
#### Examples
|
||||
Collect all tweets by Jason Scott (@textfiles):
|
||||
|
||||
snscrape twitter-user textfiles
|
||||
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `@textfiles-tweets`:
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `twitter-@textfiles`:
|
||||
|
||||
```bash
|
||||
snscrape twitter-user textfiles >@textfiles-tweets
|
||||
snscrape twitter-user textfiles >twitter-@textfiles
|
||||
```
|
||||
|
||||
To get the latest 100 tweets with the hashtag #archiveteam:
|
||||
|
||||
snscrape --max-results 100 twitter-hashtag archiveteam
|
||||
|
||||
`snscrape --help` or `snscrape <module> --help` provides details on the available options. `snscrape --help` also lists all available modules.
|
||||
|
||||
### Library
|
||||
It is also possible to use snscrape as a library in Python, but this is currently undocumented.
|
||||
|
||||
## Issue reporting
|
||||
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
|
||||
|
||||
## License
|
||||
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
||||
|
||||
|
||||
30
setup.py
30
setup.py
@@ -1,22 +1,42 @@
|
||||
import os.path
|
||||
import setuptools
|
||||
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp:
|
||||
readme = fp.read()
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name = 'snscrape',
|
||||
version = '0.1.1',
|
||||
description = 'A social networking service scraper',
|
||||
long_description = readme,
|
||||
long_description_content_type = 'text/markdown',
|
||||
author = 'JustAnotherArchivist',
|
||||
url = 'https://github.com/JustAnotherArchivist/snscrape',
|
||||
classifiers = [
|
||||
'Development Status :: 4 - Beta',
|
||||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
],
|
||||
packages = ['snscrape'],
|
||||
install_requires = ['requests', 'lxml', 'beautifulsoup4'],
|
||||
packages = ['snscrape', 'snscrape.modules'],
|
||||
setup_requires = ['setuptools_scm'],
|
||||
use_scm_version = True,
|
||||
install_requires = [
|
||||
'requests[socks]',
|
||||
'lxml',
|
||||
'beautifulsoup4',
|
||||
'pytz; python_version < "3.9.0"',
|
||||
'filelock',
|
||||
],
|
||||
python_requires = '~=3.8',
|
||||
extras_require = {
|
||||
'test': ['coverage'],
|
||||
},
|
||||
entry_points = {
|
||||
'console_scripts': [
|
||||
'snscrape = snscrape.cli:main',
|
||||
'snscrape = snscrape._cli:main',
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
338
snscrape/_cli.py
Normal file
338
snscrape/_cli.py
Normal file
@@ -0,0 +1,338 @@
|
||||
import argparse
|
||||
import collections
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import datetime
|
||||
import importlib.metadata
|
||||
import inspect
|
||||
import logging
|
||||
import requests
|
||||
# Imported in parse_args() after setting up the logger:
|
||||
#import snscrape.base
|
||||
#import snscrape.modules
|
||||
#import snscrape.version
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
|
||||
## Logging
|
||||
dumpLocals = False
|
||||
logger = logging # Replaced below after setting the logger class
|
||||
|
||||
|
||||
class Logger(logging.Logger):
    '''A logger which can additionally dump the call stack and local variables for serious messages.

    Messages of level WARNING or higher are routed through _log_with_stack. The dump itself only happens while the module-level dumpLocals flag is truthy.
    '''

    def _log_with_stack(self, level, *args, **kwargs):
        '''Log the message, then dump the caller's stack and locals (if enabled) and log where the dump was written.'''

        super().log(level, *args, **kwargs)
        if not dumpLocals:
            return
        frames = inspect.stack()
        if len(frames) < 3:
            return
        # Drop this method and the public wrapper that called it, then flip the
        # remaining frames so that the outermost caller is listed first.
        callerFrames = frames[2:]
        callerFrames.reverse()
        dumpName = _dump_stack_and_locals(callerFrames)
        super().log(level, f'Dumped stack and locals to {dumpName}')

    def warning(self, *args, **kwargs):
        '''Log at WARNING level with an optional stack/locals dump.'''

        self._log_with_stack(logging.WARNING, *args, **kwargs)

    def error(self, *args, **kwargs):
        '''Log at ERROR level with an optional stack/locals dump.'''

        self._log_with_stack(logging.ERROR, *args, **kwargs)

    def critical(self, *args, **kwargs):
        '''Log at CRITICAL level with an optional stack/locals dump.'''

        self._log_with_stack(logging.CRITICAL, *args, **kwargs)

    def log(self, level, *args, **kwargs):
        '''Log at an arbitrary level; only WARNING and above get the optional stack/locals dump.'''

        if level < logging.WARNING:
            super().log(level, *args, **kwargs)
            return
        self._log_with_stack(level, *args, **kwargs)
|
||||
|
||||
|
||||
def _requests_request_repr(name, request):
    '''Build a multi-line description of a request object for the locals dump.

    Lists the method, URL and every header, followed by whichever of the body, params and data attributes exist and are truthy.
    '''

    pieces = [
        f'{name} = {request!r}',
        f'\n {name}.method = {request.method}',
        f'\n {name}.url = {request.url}',
        f'\n {name}.headers = \\',
    ]
    for field, fieldValue in request.headers.items():
        pieces.append(f'\n {field} = {_repr("_", fieldValue)}')
    for attr in ('body', 'params', 'data'):
        # Not every request type has all three attributes, hence the default.
        attrValue = getattr(request, attr, None)
        if not attrValue:
            continue
        pieces.append(f'\n {name}.{attr} = ')
        pieces.append(_repr('_', attrValue).replace('\n', '\n '))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def _requests_response_repr(name, response, withHistory = True):
    '''Build a multi-line description of a response object for the locals dump.

    Covers the URL, the originating request, the status code, all headers and the content. If withHistory is true and the response has a history, each earlier response is described as well (without its own history).
    '''

    pieces = [
        f'{name} = {response!r}',
        f'\n {name}.url = {response.url}',
        f'\n {name}.request = ',
        _repr('_', response.request).replace('\n', '\n '),
    ]
    if withHistory and response.history:
        pieces.append(f'\n {name}.history = [')
        for previousResponse in response.history:
            previousRepr = _requests_response_repr('_', previousResponse, withHistory = False)
            pieces.append('\n ')
            pieces.append(previousRepr.replace('\n', '\n '))
        pieces.append('\n ]')
    pieces.append(f'\n {name}.status_code = {response.status_code}')
    pieces.append(f'\n {name}.headers = \\')
    for field, fieldValue in response.headers.items():
        pieces.append(f'\n {field} = {_repr("_", fieldValue)}')
    pieces.append(f'\n {name}.content = {_repr("_", response.content)}')
    return ''.join(pieces)
|
||||
|
||||
|
||||
def _requests_exception_repr(name, exc):
    '''Describe an exception together with the request and response attached to it.'''

    requestRepr = _repr(f'{name}.request', exc.request).replace('\n', '\n ')
    responseRepr = _repr(f'{name}.response', exc.response).replace('\n', '\n ')
    return f'{name} = {exc!r}' + '\n ' + requestRepr + '\n ' + responseRepr
|
||||
|
||||
|
||||
def _repr(name, value):
    '''Return a (possibly multi-line) 'name = value' description of value for the locals dump.

    Request, response and request-exception objects are delegated to the dedicated helpers. Dicts, dataclass instances, and sequences that are not made up purely of ints and strs are expanded recursively, one entry per line. Everything else falls back to repr().
    '''

    def indented(text):
        return text.replace('\n', '\n ')

    def expanded(children):
        # Header line naming the container type, followed by one child per line
        valueType = type(value)
        return f'{name} = <{valueType.__module__}.{valueType.__name__}>\n ' + '\n '.join(children)

    # Exact type checks (not isinstance) for the request/response classes
    exactType = type(value)
    if exactType is requests.Response:
        return _requests_response_repr(name, value)
    if exactType in (requests.PreparedRequest, requests.Request):
        return _requests_request_repr(name, value)
    if isinstance(value, requests.exceptions.RequestException):
        return _requests_exception_repr(name, value)

    if isinstance(value, dict):
        return expanded(indented(_repr(f'{name}[{k!r}]', v)) for k, v in value.items())
    if isinstance(value, (list, tuple, collections.deque)) and not all(isinstance(v, (int, str)) for v in value):
        return expanded(indented(_repr(f'{name}[{i}]', v)) for i, v in enumerate(value))
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        return expanded(
            _repr(f'{name}.{f.name}', f.name) + ' = ' + indented(_repr(f'{name}.{f.name}', getattr(value, f.name)))
            for f in dataclasses.fields(value)
        )

    valueRepr = f'{name} = {value!r}'
    if '\n' not in valueRepr:
        return valueRepr
    # Multi-line repr: start it on a fresh, indented line
    return '\\\n ' + indented(valueRepr)
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _dump_locals_on_exception():
    '''Context manager which dumps the stack and locals when the wrapped code raises, then re-raises.'''

    try:
        yield
    except Exception as exc:
        frames = inspect.trace()
        # The first trace entry is this context manager itself; only dump if there is something beyond it.
        if len(frames) > 1:
            dumpName = _dump_stack_and_locals(frames[1:], exc = exc)
            # fatal() is the base-class alias, so this message does not go through
            # the critical() override and cannot trigger a second dump.
            logger.fatal(f'Dumped stack and locals to {dumpName}')
        raise
|
||||
|
||||
|
||||
def _dump_stack_and_locals(trace, exc = None):
    '''Write the exception (if any), the stack, and the locals of snscrape frames to a temporary file.

    trace is a list of frame records as returned by inspect.stack() or inspect.trace(). The file is not deleted on close; its name is returned.
    '''

    with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
        if exc is not None:
            excType = type(exc)
            fp.write('Exception:\n')
            fp.write(f' {excType.__module__}.{excType.__name__}: {exc!s}\n')
            fp.write(f' args: {exc.args!r}\n')
            fp.write('\n')

        fp.write('Stack:\n')
        for record in trace:
            fp.write(f' File "{record.filename}", line {record.lineno}, in {record.function}\n')
            for line in record.code_context or ():
                fp.write(f' {line.strip()}\n')
        fp.write('\n')

        modules = [inspect.getmodule(record[0]) for record in trace]
        for i, (module, record) in enumerate(zip(modules, trace)):
            if module is None:
                # Module-less frame, e.g. dataclass.__init__: attribute it to the closest earlier frame that has a module.
                module = next((m for m in reversed(modules[:i]) if m is not None), None)
                if module is None:
                    # No previous module scope
                    continue
            moduleName = module.__name__
            if moduleName != 'snscrape' and not moduleName.startswith('snscrape.'):
                # Only frames belonging to snscrape get their locals dumped
                continue
            frameLocals = record[0].f_locals
            fp.write(f'Locals from file "{record.filename}", line {record.lineno}, in {record.function}:\n')
            for variableName, variable in frameLocals.items():
                variableRepr = _repr(variableName, variable)
                fp.write(f' {variableName} {type(variable)} = ')
                fp.write(variableRepr.replace('\n', '\n '))
                fp.write('\n')
            fp.write('\n')
            if 'self' in frameLocals and hasattr(frameLocals['self'], '__dict__'):
                fp.write('Object dict:\n')
                fp.write(repr(frameLocals['self'].__dict__))
                fp.write('\n\n')
        dumpName = fp.name
    return dumpName
|
||||
|
||||
|
||||
def parse_datetime_arg(arg):
    '''Parse a command-line argument into a timezone-aware datetime.

    Accepted forms, tried in this order:
      YYYY-MM-DD HH:MM:SS +ZZZZ
      YYYY-MM-DD HH:MM:SS
      YYYY-MM-DD +ZZZZ
      YYYY-MM-DD
      integer Unix timestamp

    Values without an explicit UTC offset are interpreted as UTC.

    Raises argparse.ArgumentTypeError if arg matches none of the forms or is a timestamp outside the range the platform supports.
    '''

    formats = ('%Y-%m-%d %H:%M:%S %z', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %z', '%Y-%m-%d')
    for fmt in formats:
        try:
            d = datetime.datetime.strptime(arg, fmt)
        except ValueError:
            continue
        if d.tzinfo is None:
            return d.replace(tzinfo = datetime.timezone.utc)
        return d

    # Try treating it as a unix timestamp
    try:
        return datetime.datetime.fromtimestamp(int(arg), datetime.timezone.utc)
    except (ValueError, OverflowError, OSError) as e:
        # int() raises ValueError for non-numeric input; fromtimestamp() raises
        # ValueError, OverflowError or OSError for out-of-range values.
        raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object') from e
|
||||
|
||||
|
||||
def parse_format(arg):
    '''Turn a user-supplied format string into one usable with str.format(item).

    Every single '{' is replaced by '{0.' so that fields refer to properties of the item, e.g. '{url}' becomes '{0.url}'. Escaped braces ('{{') are kept intact.

    Raises argparse.ArgumentTypeError if arg ends in an unmatched '{', which could never form a valid format string.
    '''

    parts = arg.split('{')
    last = len(parts) - 1
    out = []
    i = 0
    while i < last:
        out.append(parts[i])
        if parts[i + 1] == '':
            # An empty piece between two '{' means a double brace...
            if i + 1 == last:
                # ... unless it is the very end of the string, in which case the brace is dangling.
                raise argparse.ArgumentTypeError(f'Unmatched \'{{\' at the end of format string {arg!r}')
            out.append('{{')
            i += 2
        else:
            # Single brace
            out.append('{0.')
            i += 1
    out.append(parts[-1])
    return ''.join(out)
|
||||
|
||||
|
||||
class CitationAction(argparse.Action):
    '''argparse action which prints citation information taken from the installed package metadata and exits.'''

    def __init__(self, option_strings, dest = argparse.SUPPRESS, *args, default = argparse.SUPPRESS, **kwargs):
        # default is accepted but deliberately not forwarded to argparse.Action.
        super().__init__(option_strings, dest, *args, **kwargs)

    @staticmethod
    def _release_year(version):
        '''Return the four-digit year at the start of the fourth version component, or None if there is none.'''

        parts = version.split('.', 3)
        if len(parts) < 4:
            return None
        year = parts[3][:4]
        if len(year) == 4 and year.isascii() and year.isdigit():
            return year
        return None

    def __call__(self, parser, namespace, values, optionString):
        try:
            m = importlib.metadata.metadata('snscrape')
        except importlib.metadata.PackageNotFoundError:
            print('Error: could not find snscrape installation. --citation does not work without the package being installed.', file = sys.stderr)
            parser.exit(1)
        version = m['version']
        print(f'Author: {m["author"]}')
        print(f'Title: {m["name"]}: {m["summary"]}')
        print(f'URL: {m["home-page"]}')
        print(f'Version: {version}')
        year = self._release_year(version)
        if year is not None:
            print(f'Date: 2018‒{year}')
        else:
            # Versions without a date component (e.g. a fallback development version) used to crash here.
            print('Date: 2018‒(unknown)')

        if '.dev' in version:
            print()
            print('WARNING! You are running a development version. The date range may be incorrect. Please adjust the upper end of the range to the year of the commit.')

        parser.exit()
|
||||
|
||||
|
||||
def parse_args():
    '''Build the command-line parser, including one subparser per named scraper class, and parse sys.argv.

    Returns the argparse namespace. The selected scraper class is stored on it as cls.
    '''

    # Imported here rather than at module level; main() sets up the logger class before calling this function.
    import snscrape.base
    import snscrape.modules
    import snscrape.version

    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
    parser.add_argument('--citation', action = CitationAction, nargs = 0, help = 'Display recommended citation information and exit')
    parser.add_argument(
        '-v', '--verbose', '--verbosity',
        dest = 'verbosity',
        action = 'count',
        default = 0,
        help = 'Increase output verbosity',
    )
    parser.add_argument(
        '--dump-locals',
        dest = 'dumpLocals',
        action = 'store_true',
        default = False,
        help = 'Dump local variables on serious log messages (warnings or higher)',
    )
    parser.add_argument(
        '--retry', '--retries',
        dest = 'retries',
        type = int,
        default = 3,
        metavar = 'N',
        help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff',
    )
    parser.add_argument(
        '-n', '--max-results',
        dest = 'maxResults',
        type = lambda x: int(x) if int(x) >= 0 else parser.error('--max-results N must be zero or positive'),
        metavar = 'N',
        help = 'Only return the first N results',
    )
    outputGroup = parser.add_mutually_exclusive_group(required = False)
    outputGroup.add_argument('-f', '--format', dest = 'format', type = parse_format, default = None, help = 'Output format')
    outputGroup.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
    parser.add_argument(
        '--with-entity',
        dest = 'withEntity',
        action = 'store_true',
        default = False,
        help = 'Include the entity (e.g. user, channel) as the first output item',
    )
    parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
    parser.add_argument('--progress', action = 'store_true', default = False, help = 'Report progress on stderr')

    subparsers = parser.add_subparsers(dest = 'scraper', metavar = 'SCRAPER', title = 'scrapers', required = True)

    # Walk the Scraper class hierarchy breadth-first and collect every class that has a name.
    scrapers = {}
    pending = collections.deque(snscrape.base.Scraper.__subclasses__())
    while pending:
        cls = pending.popleft()
        if cls.name is not None:
            scrapers[cls.name] = cls
        pending.extend(cls.__subclasses__())

    for scraperName in sorted(scrapers):
        cls = scrapers[scraperName]
        subparser = subparsers.add_parser(cls.name, help = '', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
        cls._cli_setup_parser(subparser)
        subparser.set_defaults(cls = cls)

    args = parser.parse_args()

    if args.maxResults == 0 and not args.withEntity:
        parser.error('--max-results 0 is only valid when used with --with-entity')

    return args
|
||||
|
||||
|
||||
def setup_logging():
    '''Install Logger as the logger class and replace the module-level logger with an instance of it.'''

    global logger

    # Must happen before getLogger so that the logger created below uses the custom class.
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def configure_logging(verbosity, dumpLocals_):
    '''Configure the root logger for CLI use.

    verbosity: 0 leaves the root logger level untouched, 1 selects INFO, 2 or more selects DEBUG.
    dumpLocals_: stored in the module-level dumpLocals flag, which controls whether serious log messages also dump the stack and locals.

    All handlers previously attached to the root logger are removed and replaced by a single stream handler.
    '''

    global dumpLocals
    dumpLocals = dumpLocals_

    rootLogger = logging.getLogger()

    # Set level
    if verbosity > 0:
        rootLogger.setLevel(logging.INFO if verbosity == 1 else logging.DEBUG)

    # Remove existing handlers. Iterate over a copy: removeHandler() mutates
    # rootLogger.handlers, and removing while iterating the live list skips every other handler.
    for existingHandler in list(rootLogger.handlers):
        rootLogger.removeHandler(existingHandler)

    # Add stream handler
    formatter = logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    rootLogger.addHandler(handler)
|
||||
|
||||
|
||||
def main():
    '''CLI entry point: parse the arguments, run the selected scraper, and print one line per result.

    Each item is printed as JSON with --jsonl, via the user's format string with --format, or through its string representation otherwise. Scraping stops at the first item older than --since or once --max-results items have been printed.
    '''

    setup_logging()
    args = parse_args()
    configure_logging(args.verbosity, args.dumpLocals)
    scraper = args.cls._cli_from_args(args)

    count = 0
    with _dump_locals_on_exception():
        if args.withEntity and (entity := scraper.entity):
            print(entity.json() if args.jsonl else entity)
        if args.maxResults == 0:
            logger.info('Exiting after 0 results')
            return
        for count, item in enumerate(scraper.get_items(), start = 1):
            if args.since is not None and item.date < args.since:
                logger.info(f'Exiting due to reaching older results than {args.since}')
                break
            if args.jsonl:
                output = item.json()
            elif args.format is not None:
                output = args.format.format(item)
            else:
                output = item
            print(output)
            if args.progress and count % 100 == 0:
                print(f'Scraping, {count} results so far', file = sys.stderr)
            if args.maxResults and count >= args.maxResults:
                logger.info(f'Exiting after {count} results')
                if args.progress:
                    print(f'Stopped scraping after {count} results due to --max-results', file = sys.stderr)
                break
        else:
            # Only reached when the scraper ran out of items, i.e. neither break above was taken
            logger.info(f'Done, found {count} results')
            if args.progress:
                print(f'Finished, {count} results', file = sys.stderr)
|
||||
183
snscrape/base.py
183
snscrape/base.py
@@ -1,22 +1,121 @@
|
||||
import abc
|
||||
import copy
|
||||
import dataclasses
|
||||
import datetime
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import time
|
||||
import warnings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Item:
|
||||
class _DeprecatedProperty:
|
||||
def __init__(self, name, repl, replStr):
|
||||
self.name = name
|
||||
self.repl = repl
|
||||
self.replStr = replStr
|
||||
|
||||
def __get__(self, obj, objType):
|
||||
if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
|
||||
return self
|
||||
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2)
|
||||
return self.repl(obj)
|
||||
|
||||
|
||||
def _json_serialise_datetime(obj):
|
||||
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
|
||||
|
||||
if isinstance(obj, (datetime.datetime, datetime.date)):
|
||||
return obj.isoformat()
|
||||
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
|
||||
|
||||
|
||||
def _json_dataclass_to_dict(obj):
|
||||
if isinstance(obj, _JSONDataclass) or dataclasses.is_dataclass(obj):
|
||||
out = {}
|
||||
out['_type'] = f'{type(obj).__module__}.{type(obj).__name__}'
|
||||
for field in dataclasses.fields(obj):
|
||||
assert field.name != '_type'
|
||||
if field.name.startswith('_'):
|
||||
continue
|
||||
out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
|
||||
# Add in (non-deprecated) properties
|
||||
for k in dir(obj):
|
||||
if isinstance(getattr(type(obj), k, None), property):
|
||||
assert k != '_type'
|
||||
if k.startswith('_'):
|
||||
continue
|
||||
out[k] = _json_dataclass_to_dict(getattr(obj, k))
|
||||
return out
|
||||
elif isinstance(obj, (tuple, list)):
|
||||
return type(obj)(_json_dataclass_to_dict(x) for x in obj)
|
||||
elif isinstance(obj, dict):
|
||||
return {_json_dataclass_to_dict(k): _json_dataclass_to_dict(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, set):
|
||||
return {_json_dataclass_to_dict(v) for v in obj}
|
||||
else:
|
||||
return copy.deepcopy(obj)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class _JSONDataclass:
|
||||
'''A base class for dataclasses for conversion to JSON'''
|
||||
|
||||
def json(self):
|
||||
'''Convert the object to a JSON string'''
|
||||
|
||||
out = _json_dataclass_to_dict(self)
|
||||
for key, value in list(out.items()): # Modifying the dict below, so make a copy first
|
||||
if isinstance(value, IntWithGranularity):
|
||||
out[key] = int(value)
|
||||
assert f'{key}.granularity' not in out, f'Granularity collision on {key}.granularity'
|
||||
out[f'{key}.granularity'] = value.granularity
|
||||
return json.dumps(out, default = _json_serialise_datetime)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Item(_JSONDataclass):
|
||||
'''An abstract base class for an item returned by the scraper's get_items generator.
|
||||
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).
|
||||
'''
|
||||
|
||||
@abc.abstractmethod
|
||||
def __str__(self):
|
||||
pass
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Entity(_JSONDataclass):
|
||||
'''An abstract base class for an entity returned by the scraper's entity property.
|
||||
|
||||
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.
|
||||
'''
|
||||
|
||||
@abc.abstractmethod
|
||||
def __str__(self):
|
||||
pass
|
||||
|
||||
|
||||
class IntWithGranularity(int):
|
||||
'''A number with an associated granularity
|
||||
|
||||
For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.
|
||||
'''
|
||||
|
||||
def __new__(cls, value, granularity, *args, **kwargs):
|
||||
obj = super().__new__(cls, value, *args, **kwargs)
|
||||
obj.granularity = granularity
|
||||
return obj
|
||||
|
||||
def __reduce__(self):
|
||||
return (IntWithGranularity, (int(self), self.granularity))
|
||||
|
||||
|
||||
class URLItem(Item):
|
||||
'''A generic item which only holds a URL string.'''
|
||||
|
||||
@@ -40,30 +139,74 @@ class Scraper:
|
||||
|
||||
name = None
|
||||
|
||||
def __init__(self, retries = 3):
|
||||
def __init__(self, *, retries = 3, proxies = None):
|
||||
self._retries = retries
|
||||
self._proxies = proxies
|
||||
self._session = requests.Session()
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_items(self):
|
||||
'''Iterator yielding Items.'''
|
||||
|
||||
pass
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
def _get_entity(self):
|
||||
'''Get the entity behind the scraper, if any.
|
||||
|
||||
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
|
||||
'''
|
||||
|
||||
return None
|
||||
|
||||
@functools.cached_property
|
||||
def entity(self):
|
||||
return self._get_entity()
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
||||
proxies = proxies or self._proxies or {}
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
||||
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
|
||||
logger.info(f'Retrieving {req.url}')
|
||||
logger.debug(f'... with headers: {headers!r}')
|
||||
if data:
|
||||
logger.debug(f'... with data: {data!r}')
|
||||
if environmentSettings:
|
||||
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
|
||||
try:
|
||||
r = self._session.send(req, timeout = timeout)
|
||||
if responseOkCallback is None or responseOkCallback(r):
|
||||
logger.debug(f'{req.url} retrieved successfully')
|
||||
return r
|
||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
|
||||
except requests.exceptions.RequestException as exc:
|
||||
logger.error(f'Error retrieving {url}: {exc!r}')
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
||||
else:
|
||||
redirected = f' (redirected to {r.url})' if r.history else ''
|
||||
logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
|
||||
if r.history:
|
||||
for i, redirect in enumerate(r.history):
|
||||
logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})')
|
||||
if responseOkCallback is not None:
|
||||
success, msg = responseOkCallback(r)
|
||||
else:
|
||||
success, msg = (True, None)
|
||||
msg = f': {msg}' if msg else ''
|
||||
|
||||
if success:
|
||||
logger.debug(f'{req.url} retrieved successfully{msg}')
|
||||
return r
|
||||
else:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
|
||||
if attempt < self._retries:
|
||||
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
|
||||
logger.info(f'Waiting {sleepTime:.0f} seconds')
|
||||
@@ -81,11 +224,23 @@ class Scraper:
|
||||
return self._request('POST', *args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def setup_parser(cls, subparser):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def from_args(cls, args):
|
||||
pass
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args)
|
||||
|
||||
@classmethod
|
||||
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
||||
return cls(*args, **kwargs, retries = argparseArgs.retries)
|
||||
|
||||
|
||||
def nonempty_string(name):
    '''Build a converter that strips surrounding whitespace from a string and rejects empty results.

    The returned function carries `name` as its __name__. It returns the stripped string, or raises ValueError if nothing remains after stripping.
    '''

    def f(s):
        stripped = s.strip()
        if not stripped:
            raise ValueError('must not be an empty string')
        return stripped

    f.__name__ = name
    return f
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import snscrape.base
|
||||
import snscrape.modules
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
|
||||
parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
|
||||
help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
|
||||
parser.add_argument('-n', '--max-results', dest = 'maxResults', type = int, metavar = 'N', help = 'Only return the first N results')
|
||||
|
||||
subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
|
||||
classes = snscrape.base.Scraper.__subclasses__()
|
||||
for cls in classes:
|
||||
subparser = subparsers.add_parser(cls.name, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
|
||||
cls.setup_parser(subparser)
|
||||
subparser.set_defaults(cls = cls)
|
||||
classes.extend(cls.__subclasses__())
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# http://bugs.python.org/issue16308 / https://bugs.python.org/issue26510 (fixed in Python 3.7)
|
||||
if not args.scraper:
|
||||
raise RuntimeError('Error: no scraper specified')
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def setup_logging(verbosity):
|
||||
rootLogger = logging.getLogger()
|
||||
|
||||
# Set level
|
||||
if verbosity > 0:
|
||||
level = logging.INFO if verbosity == 1 else logging.DEBUG
|
||||
rootLogger.setLevel(level)
|
||||
for handler in rootLogger.handlers:
|
||||
handler.setLevel(level)
|
||||
|
||||
# Create formatter
|
||||
formatter = logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')
|
||||
|
||||
# Add stream handler
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(formatter)
|
||||
rootLogger.addHandler(handler)
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
setup_logging(args.verbosity)
|
||||
scraper = args.cls.from_args(args)
|
||||
|
||||
i = 0
|
||||
for i, item in enumerate(scraper.get_items(), start = 1):
|
||||
print(item)
|
||||
if args.maxResults and i >= args.maxResults:
|
||||
logger.info(f'Exiting after {i} results')
|
||||
break
|
||||
else:
|
||||
logger.info(f'Done, found {i} results')
|
||||
@@ -1,15 +1,17 @@
|
||||
import importlib
|
||||
import os
|
||||
import snscrape.base
|
||||
import pkgutil
|
||||
|
||||
|
||||
__all__ = []
|
||||
|
||||
|
||||
def _import_modules():
    '''Import every module found in this package, record its short name in __all__, and bind it as a package-level global.'''

    # Imported locally so that this function does not depend on a module-level import of importlib.
    import importlib

    prefixLen = len(__name__) + 1
    for _, moduleName, isPkg in pkgutil.iter_modules(__path__, prefix = f'{__name__}.'):
        assert not isPkg
        moduleNameWithoutPrefix = moduleName[prefixLen:]
        __all__.append(moduleNameWithoutPrefix)
        # importer.find_module(...).load_module(...) is deprecated and no longer available on Python 3.12+; import_module is the supported equivalent.
        module = importlib.import_module(moduleName)
        globals()[moduleNameWithoutPrefix] = module
|
||||
|
||||
|
||||
_import_modules()
|
||||
|
||||
@@ -1,77 +1,364 @@
|
||||
__all__ = ['FacebookPost', 'User', 'FacebookUserScraper', 'FacebookCommunityScraper', 'FacebookGroupScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FacebookUserScraper(snscrape.base.Scraper):
|
||||
name = 'facebook-user'
|
||||
@dataclasses.dataclass
|
||||
class FacebookPost(snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
date: datetime.datetime
|
||||
content: typing.Optional[str]
|
||||
outlinks: list
|
||||
|
||||
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
||||
|
||||
def __str__(self):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
pageId: int
|
||||
name: str
|
||||
verified: bool
|
||||
created: typing.Optional[datetime.date] = None
|
||||
pageOwner: typing.Optional[str] = None
|
||||
likes: typing.Optional[int] = None
|
||||
followers: typing.Optional[int] = None
|
||||
checkins: typing.Optional[int] = None
|
||||
address: typing.Optional[str] = None
|
||||
phone: typing.Optional[str] = None
|
||||
web: typing.Optional[str] = None
|
||||
keywords: typing.Optional[typing.List[str]] = None
|
||||
|
||||
def __str__(self):
|
||||
return f'https://www.facebook.com/{self.username}/'
|
||||
|
||||
|
||||
class _FacebookCommonScraper(snscrape.base.Scraper):
|
||||
def _clean_url(self, dirtyUrl):
|
||||
u = urllib.parse.urlparse(dirtyUrl)
|
||||
if u.path == '/permalink.php':
|
||||
# Retain only story_fbid and id parameters
|
||||
q = urllib.parse.parse_qs(u.query)
|
||||
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('story_fbid', q['story_fbid'][0]), ('id', q['id'][0]))), '')
|
||||
elif u.path == '/photo.php':
|
||||
# Retain only the fbid parameter
|
||||
q = urllib.parse.parse_qs(u.query)
|
||||
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('fbid', q['fbid'][0]),)), '')
|
||||
elif u.path == '/media/set/':
|
||||
# Retain only the set parameter and try to shorten it to the minimum
|
||||
q = urllib.parse.parse_qs(u.query)
|
||||
setVal = q['set'][0]
|
||||
if setVal.rstrip('0123456789').endswith('.a.'):
|
||||
setVal = f'a.{setVal.rsplit(".", 1)[1]}'
|
||||
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
|
||||
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
|
||||
# No manipulation of the path needed, but strip the query string
|
||||
clean = (u.scheme, u.netloc, u.path, '', '')
|
||||
elif u.path.split('/')[2] in ('photos', 'videos'):
|
||||
# Path: "/" username or ID "/" photos or videos "/" crap "/" ID of photo or video "/"
|
||||
# But to be safe, also handle URLs that don't have that crap correctly.
|
||||
if u.path.count('/') == 4:
|
||||
clean = (u.scheme, u.netloc, u.path, '', '')
|
||||
elif u.path.count('/') == 5:
|
||||
# Strip out the third path component
|
||||
pathcomps = u.path.split('/')
|
||||
pathcomps.pop(3) # Don't forget about the empty string at the beginning!
|
||||
clean = (u.scheme, u.netloc, '/'.join(pathcomps), '', '')
|
||||
else:
|
||||
return dirtyUrl
|
||||
else:
|
||||
# If we don't recognise the URL, just return the original one.
|
||||
return dirtyUrl
|
||||
return urllib.parse.urlunsplit(clean)
|
||||
|
||||
def _is_odd_link(self, href, entryText, mode):
|
||||
# Returns (isOddLink: bool, warn: bool|None)
|
||||
if mode == 'user':
|
||||
if not any(x in href for x in ('/posts/', '/photos/', '/videos/', '/permalink.php?', '/events/', '/notes/', '/photo.php?', '/media/set/')):
|
||||
if href == '#' and 'new photo' in entryText and 'to the album' in entryText:
|
||||
# Don't print a warning if it's a "User added 5 new photos to the album"-type entry, which doesn't have a permalink.
|
||||
return True, False
|
||||
elif href.startswith('/business/help/788160621327601/?'):
|
||||
# Skip the help article about branded content
|
||||
return True, False
|
||||
else:
|
||||
return True, True
|
||||
return False, None
|
||||
elif mode == 'group':
|
||||
if not re.match(r'^/groups/[^/]+/permalink/\d+/(\?|$)', href):
|
||||
return True, True
|
||||
return False, None
|
||||
|
||||
def _soup_to_items(self, soup, baseUrl, mode):
    '''Yield a FacebookPost for each top-level post entry (div._5pcr) found in soup.

    soup: parsed markup to search for entries
    baseUrl: URL against which the entries' relative links are resolved
    mode: passed on to _is_odd_link to decide which links count as regular permalinks

    Entries nested inside another entry, entries without a link, and entries with an odd link are skipped.
    '''

    cleanUrl = None  # Value from previous iteration is used for warning on link-less entries
    for entry in soup.find_all('div', class_ = '_5pcr'):  # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
        # Check that this is not inside another div._5pcr to avoid duplicates or extracting the wrong URL (e.g. 'X was mentioned in a post' on community pages)
        parent = entry.parent
        isNested = False
        while parent:
            if parent.name == 'div' and 'class' in parent.attrs and '_5pcr' in parent.attrs['class']:
                isNested = True
                break
            parent = parent.parent
        if isNested:
            continue

        entryA = entry.find('a', class_ = '_5pcq')  # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
        mediaSetA = entry.find('a', class_ = '_17z-')
        if not mediaSetA and not entryA:
            _logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
            continue
        # Prefer the media set link when the entry link is missing or only a '#' placeholder
        if mediaSetA and (not entryA or entryA['href'] == '#'):
            href = mediaSetA['href']
        elif entryA:
            href = entryA['href']
        oddLink, warn = self._is_odd_link(href, entry.text, mode)
        if oddLink:
            if warn:
                _logger.warning(f'Ignoring odd link: {href}')
            continue
        dirtyUrl = urllib.parse.urljoin(baseUrl, href)
        cleanUrl = self._clean_url(dirtyUrl)
        # data-utime is read as a Unix timestamp and turned into an aware UTC datetime
        date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
        if (contentDiv := entry.find('div', class_ = '_5pbx')):
            content = contentDiv.text
        else:
            content = None

        # Collect external links, which are wrapped in l.facebook.com redirects carrying the target in the 'u' parameter
        outlinks = []
        for a in entry.find_all('a'):
            if not a.has_attr('href'):
                continue
            outHref = a.get('href')
            if not outHref.startswith('https://l.facebook.com/l.php?'):
                continue
            query = urllib.parse.parse_qs(urllib.parse.urlparse(outHref).query)
            if 'u' not in query or len(query['u']) != 1:
                _logger.warning(f'Ignoring odd outlink: {outHref}')
                continue
            outlink = query['u'][0]
            # The scheme test must be grouped: 'and' binds tighter than 'or', so without grouping the duplicate check would only apply to https:// links.
            if outlink.startswith(('http://', 'https://')) and outlink not in outlinks:
                outlinks.append(outlink)
        yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks)
|
||||
|
||||
|
||||
class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._username = username
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
self._initialPage = None
|
||||
self._initialPageSoup = None
|
||||
|
||||
def _soup_to_items(self, soup, username, baseUrl):
|
||||
yielded = set()
|
||||
for a in soup.find_all('a', href = re.compile(r'^/[^/]+/(posts|photos|videos)/[^/]*\d')):
|
||||
href = a.get('href')
|
||||
if href.startswith(f'/{username}/'):
|
||||
link = urllib.parse.urljoin(baseUrl, href)
|
||||
if link not in yielded:
|
||||
yield snscrape.base.URLItem(link)
|
||||
yielded.add(link)
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
self._initialPage = r
|
||||
self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
_logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error('Got status code {r.status_code}')
|
||||
return
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
username = re.sub(r'^https://www\.facebook\.com/([^/]+)/$', r'\1', soup.find('link').get('href')) # Canonical capitalisation
|
||||
baseUrl = f'https://www.facebook.com/{username}/'
|
||||
yield from self._soup_to_items(soup, username, baseUrl)
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
while nextPageLink:
|
||||
logger.info('Retrieving next page')
|
||||
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
|
||||
_logger.info('Retrieving next page')
|
||||
|
||||
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
|
||||
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
|
||||
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
|
||||
r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
response = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||
assert 'domops' in response
|
||||
assert len(response['domops']) == 1
|
||||
assert len(response['domops'][0]) == 4
|
||||
assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
|
||||
assert response['domops'][0][1] == '#www_pages_reaction_see_more_unitwww_pages_home'
|
||||
assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
|
||||
assert response['domops'][0][2] == False
|
||||
assert '__html' in response['domops'][0][3]
|
||||
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
|
||||
yield from self._soup_to_items(soup, username, baseUrl)
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Facebook username or user ID')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.username, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class FacebookUserScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-user'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
|
||||
def _get_entity(self):
|
||||
kwargs = {}
|
||||
|
||||
nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
|
||||
handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
|
||||
handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
|
||||
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
|
||||
createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')
|
||||
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
handleDiv = handleDivPattern.search(r.text)
|
||||
handle = handlePattern.search(handleDiv.group(0))
|
||||
kwargs['username'] = handle.group(1)
|
||||
|
||||
nameVerifiedMarkup = nameVerifiedMarkupPattern.search(r.text)
|
||||
nameVerifiedMarkup = json.loads(nameVerifiedMarkup.group(1))
|
||||
nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedMarkup, 'lxml')
|
||||
kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
|
||||
kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))
|
||||
|
||||
pageTransparencyContentDiv = soup.find('div', class_ = '_61-0')
|
||||
if pageTransparencyContentDiv.text.startswith('Page created - '):
|
||||
createdDateMess = pageTransparencyContentDiv.text.split(' - ', 1)[1]
|
||||
m = createdDatePattern.match(createdDateMess)
|
||||
assert m, 'unexpected created div content'
|
||||
kwargs['created'] = datetime.date(int(m.group(3)), months.index(m.group(1)) + 1, int(m.group(2)))
|
||||
if pageTransparencyContentDiv.text.startswith('Confirmed Page Owner: '):
|
||||
kwargs['pageOwner'] = pageTransparencyContentDiv.text.split(': ', 1)[1]
|
||||
|
||||
communityDiv = soup.find('div', class_ = '_6590')
|
||||
for div in communityDiv.find_all('div', class_ = '_4bl9'):
|
||||
text = div.text
|
||||
if text.endswith(' people like this'):
|
||||
kwargs['likes'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' people follow this'):
|
||||
kwargs['followers'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' check-ins'):
|
||||
kwargs['checkins'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
|
||||
aboutDiv = soup.find('div', class_ = '_u9q')
|
||||
if aboutDiv:
|
||||
# As if the above wasn't already ugly enough, this is where it gets really bad...
|
||||
for div in aboutDiv.find_all('div', class_ = '_2pi9'):
|
||||
img = div.find('img', class_ = '_3-91')
|
||||
if not img:
|
||||
continue
|
||||
if img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png': # Address
|
||||
rawAddress = div.find('div', class_ = '_2wzd').text
|
||||
kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png': # Phone number
|
||||
kwargs['phone'] = div.find('div', class_ = '_4bl9').text
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png': # Web link
|
||||
for a in div.find_all('a'):
|
||||
if a.text == '' or 'href' not in a.attrs or a.find('span'):
|
||||
continue
|
||||
dirtyWeb = a['href']
|
||||
assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
|
||||
kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png': # Keywords
|
||||
kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')
|
||||
|
||||
androidUrlMeta = soup.find('meta', property = 'al:android:url')
|
||||
assert androidUrlMeta['content'].startswith('fb://page/') and androidUrlMeta['content'].endswith('?referrer=app_link')
|
||||
kwargs['pageId'] = int(androidUrlMeta['content'][10:-18])
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
|
||||
class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-community'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/community/'
|
||||
|
||||
|
||||
class FacebookGroupScraper(_FacebookCommonScraper):
|
||||
name = 'facebook-group'
|
||||
|
||||
def __init__(self, group, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._group = group
|
||||
|
||||
def get_items(self):
    '''Yield the posts of the group, newest first as requested from the site (sorting_setting=CHRONOLOGICAL), following pagination.

    Returns without yielding anything if the group page responds with 404.
    Raises snscrape.base.ScraperException on other non-200 responses or when the expected code containers are missing from the page.
    '''

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}

    pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
    pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
    spuriousForLoopPattern = re.compile(r'^for \(;;\);')

    baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
    r = self._get(baseUrl, headers = headers)
    if r.status_code == 404:
        _logger.warning('Group does not exist')
        return
    elif r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

    if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
        raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Posts are inside an HTML comment in two code tags with IDs listed in JS...
    for codeContainerIdStart in ('content:{pagelet_group_mall:{container_id:"', 'content:{group_mall_after_tti:{container_id:"'):
        codeContainerIdPos = r.text.index(codeContainerIdStart) + len(codeContainerIdStart)
        codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
        codeContainer = soup.find('code', id = codeContainerId)
        if not codeContainer:
            raise snscrape.base.ScraperException('Code container not found')
        if type(codeContainer.string) is not bs4.element.Comment:
            raise snscrape.base.ScraperException('Code container does not contain a comment')
        codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
        yield from self._soup_to_items(codeSoup, baseUrl, 'group')

    # Pagination
    # The search result is checked before use: a response without further pagelet data ends the pagination instead of failing on None.
    while (pageletMatch := pageletDataPattern.search(r.text)):
        data = pageletMatch.group(0)[pageletDataPrefixLength:]
        # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
        r = self._get(
            'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
            params = {'data': data, '__a': 1},
            headers = headers,
        )
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        # The JSON body is prefixed with 'for (;;);', which has to be removed before parsing
        obj = json.loads(spuriousForLoopPattern.sub('', r.text))
        if obj['payload'] == '':
            # End of pagination
            break
        soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
        yield from self._soup_to_items(soup, baseUrl, 'group')
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('group', type = snscrape.base.nonempty_string('group'), help = 'A group name or ID')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.group)
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GooglePlusUserScraper(snscrape.base.Scraper):
|
||||
name = 'googleplus-user'
|
||||
|
||||
def __init__(self, user, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._user = user
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# Global data; only needed for the session ID
|
||||
#TODO: Make this more robust somehow
|
||||
match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
|
||||
if not match:
|
||||
logger.error('Unable to find session ID')
|
||||
return
|
||||
sid = match.group('sid')
|
||||
|
||||
# Page data
|
||||
# As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
|
||||
match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
|
||||
if not match:
|
||||
logger.error('Unable to extract data')
|
||||
return
|
||||
jsonData = match.group('data')
|
||||
response = json.loads(jsonData)
|
||||
if response[0][7] is None:
|
||||
logger.info('User has no posts')
|
||||
return
|
||||
for postObj in response[0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
cursor = response[0][1] # 'ADSJ_x'
|
||||
if cursor is None:
|
||||
# No further pages
|
||||
return
|
||||
baseDate = datetime.datetime.utcnow()
|
||||
baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
|
||||
userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]
|
||||
|
||||
for counter in itertools.count(start = 2):
|
||||
logger.info('Retrieving next page')
|
||||
reqid = 1 + baseSeconds + int(1e5) * counter
|
||||
r = self._post(
|
||||
f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
|
||||
data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
|
||||
headers = headers
|
||||
)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# As if everything up to here wasn't terrible already, this is where it gets *really* bad.
|
||||
# The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
|
||||
# The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
|
||||
# It sucks.
|
||||
# Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
|
||||
# I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
|
||||
# Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.
|
||||
|
||||
garbage = r.text
|
||||
assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
|
||||
data = []
|
||||
pos = 6
|
||||
while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
|
||||
pos += 1
|
||||
response = json.JSONDecoder().raw_decode(''.join(garbage[pos:]))[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end
|
||||
|
||||
for postObj in response[0][2]['74333095'][0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
|
||||
cursor = response[0][2]['74333095'][0][1]
|
||||
|
||||
if cursor is None:
|
||||
break
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.user, retries = args.retries)
|
||||
@@ -1,74 +1,243 @@
|
||||
__all__ = ['InstagramPost', 'User', 'InstagramUserScraper', 'InstagramHashtagScraper', 'InstagramLocationScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InstagramUserScraper(snscrape.base.Scraper):
|
||||
@dataclasses.dataclass
|
||||
class InstagramPost(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: typing.Optional[str]
|
||||
thumbnailUrl: str
|
||||
displayUrl: str
|
||||
username: typing.Optional[str]
|
||||
likes: int
|
||||
comments: int
|
||||
commentsDisabled: bool
|
||||
isVideo: bool
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: typing.Optional[str]
|
||||
followers: snscrape.base.IntWithGranularity
|
||||
following: snscrape.base.IntWithGranularity
|
||||
posts: snscrape.base.IntWithGranularity
|
||||
|
||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||
|
||||
def __str__(self):
|
||||
return f'https://www.instagram.com/{self.username}/'
|
||||
|
||||
|
||||
class _InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
self._initialPage = None
|
||||
|
||||
def _response_to_items(self, response):
|
||||
for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
code = node['node']['shortcode']
|
||||
username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else None
|
||||
url = f'https://www.instagram.com/p/{code}/'
|
||||
yield InstagramPost(
|
||||
url = url,
|
||||
date = datetime.datetime.fromtimestamp(node['node']['taken_at_timestamp'], datetime.timezone.utc),
|
||||
content = node['node']['edge_media_to_caption']['edges'][0]['node']['text'] if len(node['node']['edge_media_to_caption']['edges']) else None,
|
||||
thumbnailUrl = node['node']['thumbnail_src'],
|
||||
displayUrl = node['node']['display_url'],
|
||||
username = username,
|
||||
likes = node['node']['edge_media_preview_like']['count'],
|
||||
comments = node['node']['edge_media_to_comment']['count'],
|
||||
commentsDisabled = node['node']['comments_disabled'],
|
||||
isVideo = node['node']['is_video'],
|
||||
)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
self._initialPage = r
|
||||
return self._initialPage
|
||||
|
||||
def _check_initial_page_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return True, None
|
||||
jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
|
||||
try:
|
||||
obj = json.loads(jsonData)
|
||||
except json.JSONDecodeError:
|
||||
return False, 'invalid JSON'
|
||||
r._snscrape_json_obj = obj
|
||||
return True, None
|
||||
|
||||
def _check_json_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return False, f'status code {r.status_code}'
|
||||
if r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
try:
|
||||
obj = json.loads(r.text)
|
||||
except json.JSONDecodeError as e:
|
||||
return False, f'invalid JSON ({e!r})'
|
||||
r._snscrape_json_obj = obj
|
||||
return True, None
|
||||
|
||||
def get_items(self):
|
||||
r = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
_logger.warning('Page does not exist')
|
||||
return
|
||||
response = r._snscrape_json_obj
|
||||
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
||||
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
||||
_logger.info('Page has no posts')
|
||||
return
|
||||
if not response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
_logger.warning('Private account')
|
||||
return
|
||||
pageID = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._pageIDKey]
|
||||
yield from self._response_to_items(response['entry_data'][self._pageName][0]['graphql'])
|
||||
if not response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['has_next_page']:
|
||||
return
|
||||
endCursor = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
headers = self._headers.copy()
|
||||
while True:
|
||||
_logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
variables = self._variablesFormat.format(**locals())
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
|
||||
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
|
||||
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
response = r._snscrape_json_obj
|
||||
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
return
|
||||
yield from self._response_to_items(response['data'])
|
||||
if not response['data'][self._responseContainer][self._edgeXToMedia]['page_info']['has_next_page']:
|
||||
return
|
||||
endCursor = response['data'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
|
||||
class InstagramUserScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-user'
|
||||
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._username = username
|
||||
self._initialUrl = f'https://www.instagram.com/{username}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
self._responseContainer = 'user'
|
||||
self._edgeXToMedia = 'edge_owner_to_timeline_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
def _response_to_items(self, response, username):
|
||||
for node in response['user']['edge_owner_to_timeline_media']['edges']:
|
||||
code = node['node']['shortcode']
|
||||
yield snscrape.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}') #TODO: Do we want the taken-by parameter in here?
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(f'https://www.instagram.com/{self._username}/', headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
def _get_entity(self):
|
||||
r = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
if '<meta property="og:description" content="' not in r.text:
|
||||
return
|
||||
jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
|
||||
response = json.loads(jsonData)
|
||||
rhxGis = response['rhx_gis']
|
||||
if response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
|
||||
logger.info('User has no posts')
|
||||
return
|
||||
if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
|
||||
logger.warning('Private account')
|
||||
return
|
||||
userID = response['entry_data']['ProfilePage'][0]['graphql']['user']['id']
|
||||
username = response['entry_data']['ProfilePage'][0]['graphql']['user']['username'] # Might have different capitalisation than self._username
|
||||
yield from self._response_to_items(response['entry_data']['ProfilePage'][0]['graphql'], username)
|
||||
if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
|
||||
return
|
||||
endCursor = response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
|
||||
ogDescriptionContentPos = r.text.index('<meta property="og:description" content="') + len('<meta property="og:description" content="')
|
||||
ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]
|
||||
|
||||
while True:
|
||||
logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
variables = f'{{"id":"{userID}","first":50,"after":"{endCursor}"}}'
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
|
||||
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={variables}', headers = headers)
|
||||
numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
|
||||
ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_-]+))$')
|
||||
m = ogDescriptionPattern.match(ogDescription)
|
||||
assert m, 'unexpected og:description format'
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
def parse_num(s):
|
||||
if s.endswith('m'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
elif s.endswith('k'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
response = json.loads(r.text)
|
||||
if not response['data']['user']['edge_owner_to_timeline_media']['edges']:
|
||||
return
|
||||
yield from self._response_to_items(response['data'], username)
|
||||
if not response['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
|
||||
return
|
||||
endCursor = response['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
|
||||
followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
|
||||
following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
|
||||
posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
|
||||
return User(
|
||||
username = m.group(5) or m.group(6),
|
||||
name = m.group(4) or None,
|
||||
followers = followers,
|
||||
following = following,
|
||||
posts = posts,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'An Instagram username')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'An Instagram username (no leading @)')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.username, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class InstagramHashtagScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-hashtag'
|
||||
|
||||
def __init__(self, hashtag, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
|
||||
self._pageName = 'TagPage'
|
||||
self._responseContainer = 'hashtag'
|
||||
self._edgeXToMedia = 'edge_hashtag_to_media'
|
||||
self._pageIDKey = 'name'
|
||||
self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
|
||||
self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('hashtag', type = snscrape.base.nonempty_string('hashtag'), help = 'An Instagram hashtag (no leading #)')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.hashtag)
|
||||
|
||||
|
||||
class InstagramLocationScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-location'
|
||||
|
||||
def __init__(self, locationId, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
|
||||
self._pageName = 'LocationsPage'
|
||||
self._responseContainer = 'location'
|
||||
self._edgeXToMedia = 'edge_location_to_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('locationid', help = 'An Instagram location ID', type = int)
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.locationid)
|
||||
|
||||
340
snscrape/modules/mastodon.py
Normal file
340
snscrape/modules/mastodon.py
Normal file
@@ -0,0 +1,340 @@
|
||||
__all__ = ['Toot', 'Boost', 'Attachment', 'Poll', 'PollOption', 'User', 'CustomEmoji', 'MastodonProfileScraper', 'MastodonTootScraperMode', 'MastodonTootScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import enum
|
||||
import json
|
||||
import logging
|
||||
import snscrape.base
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Toot(snscrape.base.Item):
|
||||
url: str
|
||||
id: str
|
||||
user: 'User'
|
||||
date: datetime.datetime
|
||||
text: str
|
||||
spoilerText: typing.Optional[str] = None
|
||||
attachments: typing.Optional[typing.List['Attachment']] = None
|
||||
links: typing.Optional[typing.List[str]] = None
|
||||
mentionedUsers: typing.Optional[typing.List['User']] = None
|
||||
hashtags: typing.Optional[typing.List[str]] = None
|
||||
poll: typing.Optional['Poll'] = None
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Boost(snscrape.base.Item):
|
||||
user: 'User'
|
||||
toot: Toot
|
||||
|
||||
def __str__(self):
|
||||
# Boosts don't have their own URLs
|
||||
return str(self.toot)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Attachment:
    '''A file attached to a toot.'''

    # Location of the attachment
    url: str
    # Name of the attachment
    name: str
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Poll:
    '''A poll attached to a toot.'''

    # Identifier of the poll
    id: str
    # Point in time at which the poll expires
    expirationDate: datetime.datetime
    # Whether the poll is flagged as 'multiple'
    multiple: bool
    # The individual poll options
    options: typing.List['PollOption']
    # Total vote count of the poll
    votesCount: int
    # Voter count; available since version 3.0.0 (commit 3babf846), None if absent
    votersCount: typing.Optional[int] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PollOption:
    '''One option of a poll.'''

    # Title of the option
    title: str
    # Vote count of this option
    votesCount: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
account: str # @username@domain.invalid
|
||||
displayName: typing.Optional[str] = None
|
||||
displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
|
||||
avatarUrl: typing.Optional[str] = None
|
||||
_url: typing.Optional[str] = None
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
if self._url:
|
||||
return self._url
|
||||
return f'https://{"/@".join(reversed(self.account[1:].split("@")))}'
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CustomEmoji:
    '''A custom emoji embedded in a display name.'''

    # Short name of the emoji (taken from the image's alt text)
    shortName: str
    # URL of the emoji image
    url: str
    # URL of the static variant of the emoji image.
    # Optional because the 'emojione' markup of older instances carries only a single image URL,
    # so the display name parser constructs the emoji without this value.
    staticUrl: typing.Optional[str] = None
|
||||
|
||||
|
||||
class _MastodonCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
self._lastRequest = 0
|
||||
|
||||
def _rate_limited_get(self, *args, **kwargs):
|
||||
if (diff := time.time() - self._lastRequest) < 3:
|
||||
time.sleep(3 - diff)
|
||||
self._lastRequest = time.time()
|
||||
return self._get(*args, **kwargs)
|
||||
|
||||
def _entries_to_items(self, entries, url):
|
||||
for entry in entries:
|
||||
if entry.find('a', class_ = 'load-more'):
|
||||
continue
|
||||
|
||||
tootKwargs = {}
|
||||
|
||||
info = entry.find('div', class_ = 'status__info')
|
||||
if not info: # Before 2.5.0 (commit bb71538b)
|
||||
info = entry.find('div', class_ = 'status__header')
|
||||
if not info: # Detailed status (i.e. toot page rather than timeline)?
|
||||
info = entry.find('div', class_ = 'detailed-status__meta')
|
||||
link = info.find('a', class_ = 'status__relative-time')
|
||||
if not link: # Detailed status?
|
||||
link = info.find('a', class_ = 'detailed-status__datetime')
|
||||
tootKwargs['url'] = link['href']
|
||||
tootKwargs['id'] = tootKwargs['url'].rsplit('/', 1)[1]
|
||||
tootKwargs['date'] = datetime.datetime.strptime(info.find('data', class_ = 'dt-published')['value'], '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo = datetime.timezone.utc)
|
||||
|
||||
userKwargs = {}
|
||||
userLink = info.find('a', class_ = 'status__display-name')
|
||||
if not userLink: # Detailed status?
|
||||
userLink = entry.find('a', class_ = 'detailed-status__display-name')
|
||||
userNameSpan = userLink.find('span', class_ = 'display-name')
|
||||
userKwargs['account'] = userNameSpan.find('span').text.strip()
|
||||
if userKwargs['account'].count('@') == 1: # Ancient versions don't include the instance for posts from accounts on the instance itself
|
||||
userKwargs['account'] = self._url_to_account(userLink['href'])
|
||||
userKwargs['_url'] = urllib.parse.urljoin(url, userLink['href'])
|
||||
userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userNameSpan.find('strong'), url)
|
||||
userKwargs['avatarUrl'] = urllib.parse.urljoin(url, userLink.find('img', class_ = 'u-photo')['src'])
|
||||
tootKwargs['user'] = User(**userKwargs)
|
||||
|
||||
content = entry.find('div', class_ = 'status__content')
|
||||
if not content.find(class_ = 'status__content__spoiler-link'):
|
||||
tootKwargs['text'] = '\n\n'.join(p.text for p in content.find_all('p'))
|
||||
else:
|
||||
tootKwargs['text'] = content.find('span', class_ = 'p-summary').text
|
||||
tootKwargs['spoilerText'] = '\n\n'.join(p.text for p in content.find('div', class_ = 'e-content').find_all('p'))
|
||||
|
||||
if (attachmentsDiv := entry.find('div', class_ = 'attachment-list')):
|
||||
attachments = []
|
||||
for a in attachmentsDiv.find_all('a'):
|
||||
attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a.text.strip()))
|
||||
tootKwargs['attachments'] = attachments
|
||||
elif (mediaGalleryDiv := entry.find('div', attrs = {'data-component': 'MediaGallery'})): # Before 2.7.0 (https://github.com/mastodon/mastodon/issues/6714)
|
||||
o = json.loads(mediaGalleryDiv['data-props'])
|
||||
attachments = []
|
||||
for medium in o['media']:
|
||||
attachments.append(Attachment(url = urllib.parse.urljoin(url, medium['url']), name = medium['url'].rsplit('/', 1)[-1].strip()))
|
||||
tootKwargs['attachments'] = attachments
|
||||
elif (attachmentsDiv := entry.find('div', class_ = 'status__attachments')): # Before 2.3.0 (commit 2bbf987a)
|
||||
attachments = []
|
||||
for a in attachmentsDiv.find_all('a'):
|
||||
attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a['href'].rsplit('/', 1)[1]))
|
||||
tootKwargs['attachments'] = attachments
|
||||
|
||||
links = []
|
||||
mentionedUsers = []
|
||||
hashtags = []
|
||||
for a in content.find_all('a'):
|
||||
cls = a.get('class', [])
|
||||
if 'mention' in cls and 'u-url' in cls:
|
||||
mentionUrl = urllib.parse.urljoin(url, a['href'])
|
||||
mentionedUsers.append(User(account = self._url_to_account(mentionUrl), _url = mentionUrl))
|
||||
elif 'mention' in cls and 'hashtag' in cls:
|
||||
hashtags.append(a.text.strip())
|
||||
else:
|
||||
links.append(urllib.parse.urljoin(url, a['href']))
|
||||
if links:
|
||||
tootKwargs['links'] = links
|
||||
if mentionedUsers:
|
||||
tootKwargs['mentionedUsers'] = mentionedUsers
|
||||
if hashtags:
|
||||
tootKwargs['hashtags'] = hashtags
|
||||
|
||||
if (pollDiv := entry.find('div', attrs = {'data-component': 'Poll'})):
|
||||
o = json.loads(pollDiv['data-props'])
|
||||
pollKwargs = {}
|
||||
pollKwargs['id'] = o['poll']['id']
|
||||
pollKwargs['expirationDate'] = datetime.datetime.strptime(o['poll']['expires_at'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo = datetime.timezone.utc)
|
||||
pollKwargs['multiple'] = o['poll']['multiple']
|
||||
pollKwargs['options'] = [PollOption(title = op['title'], votesCount = op['votes_count']) for op in o['poll']['options']]
|
||||
pollKwargs['votesCount'] = o['poll']['votes_count']
|
||||
if 'voters_count' in o['poll']: # 3.0.0 (commit 3babf846)
|
||||
pollKwargs['votersCount'] = o['poll']['voters_count']
|
||||
tootKwargs['poll'] = Poll(**pollKwargs)
|
||||
|
||||
toot = Toot(**tootKwargs)
|
||||
|
||||
# Boosts
|
||||
prepend = entry.find('div', class_ = 'status__prepend')
|
||||
if not prepend: # Before 2.5.0 (commit bb71538b)
|
||||
prepend = entry.find('div', class_ = 'pre-header')
|
||||
if prepend and prepend.find('i', class_ = 'fa-retweet'): # Is a boost
|
||||
userKwargs = {}
|
||||
userLink = prepend.find('a', class_ = 'status__display-name')
|
||||
# The user is always on this instance since that's the only place where boosts are shown, hence there is no explicit account span. Reconstruct from URL.
|
||||
userUrl = urllib.parse.urljoin(url, userLink['href'])
|
||||
assert userUrl.count('/') == 3 and userUrl.count('/@') == 1
|
||||
userKwargs['account'] = '@'.join(reversed(userUrl.split('/')[2:]))
|
||||
userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userLink.find('strong'), url)
|
||||
toot = Boost(user = User(**userKwargs), toot = toot)
|
||||
|
||||
yield toot
|
||||
|
||||
def _display_name(self, strong, url):
|
||||
outPlain = []
|
||||
outFull = []
|
||||
hasCustomEmoji = False
|
||||
for child in strong.children:
|
||||
if isinstance(child, bs4.element.NavigableString):
|
||||
outPlain.append(str(child))
|
||||
outFull.append(str(child))
|
||||
elif child.name == 'img' and 'custom-emoji' in child.get('class', []):
|
||||
hasCustomEmoji = True
|
||||
outPlain.append(child['alt'])
|
||||
outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['data-original']), staticUrl = urllib.parse.urljoin(url, child['data-static'])))
|
||||
elif child.name == 'img' and 'emojione' in child.get('class', []):
|
||||
# Version 2.0.0 (which first added custom emojis) to 2.9.4: no data-* attributes, only gets one of the URLs with no (easy, reliable) way of knowing which it is.
|
||||
hasCustomEmoji = True
|
||||
outPlain.append(child['alt'])
|
||||
outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['src'])))
|
||||
else:
|
||||
_logger.warning(f'Unexpected display name child: {child!r}')
|
||||
return ''.join(outPlain), outFull if hasCustomEmoji else None
|
||||
|
||||
@staticmethod
|
||||
def _url_to_account(url):
|
||||
if url.count('/') == 3 and url.count('/@') == 1:
|
||||
return '@'.join(reversed(url.split('/')[2:]))
|
||||
if url.count('/') == 4 and '/users/' in url: # E.g. Pleroma, also supported by Mastodon
|
||||
return '@' + '@'.join(reversed(url.split('/')[2::2]))
|
||||
if url.count('/') == 4 and '/accounts/' in url: # E.g. Peertube
|
||||
return '@' + '@'.join(reversed(url.split('/')[2::2]))
|
||||
if url.count('/') == 4 and '/profile/' in url: # E.g. Friendica
|
||||
return '@' + '@'.join(reversed(url.split('/')[2::2]))
|
||||
raise ValueError('Unrecognised account URL format')
|
||||
|
||||
|
||||
class MastodonProfileScraper(_MastodonCommonScraper):
|
||||
name = 'mastodon-profile'
|
||||
|
||||
def __init__(self, account, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if account.startswith('@') and account.count('@') == 2:
|
||||
account, domain = account[1:].split('@')
|
||||
url = f'https://{domain}/@{account}'
|
||||
else:
|
||||
url = account
|
||||
self._url = url
|
||||
|
||||
def get_items(self):
|
||||
initial = True
|
||||
while True:
|
||||
if initial:
|
||||
r = self._rate_limited_get(f'{self._url}/with_replies', headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
if r.status_code == 404: # Possibly an old instance where with_replies doesn't exist, try without that.
|
||||
r = self._rate_limited_get(self._url, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
if r.status_code == 404:
|
||||
_logger.warning('Account does not exist')
|
||||
return
|
||||
_logger.warning('Old Mastodon instance, cannot retrieve reply toots')
|
||||
initial = False
|
||||
else:
|
||||
r = self._rate_limited_get(url, headers = self._headers)
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)
|
||||
|
||||
nextA = soup.find('a', class_ = 'load-more', href = lambda x: '?max_id=' in x or '&max_id=' in x)
|
||||
if not nextA: # Before 2.5.0 (commit bb71538b)
|
||||
paginationDiv = soup.find('div', class_ = 'pagination')
|
||||
if paginationDiv:
|
||||
nextA = paginationDiv.find('a', class_ = 'next')
|
||||
if not nextA: # End of pagination
|
||||
break
|
||||
url = urllib.parse.urljoin(r.url, nextA['href'])
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('account', type = snscrape.base.nonempty_string('account'), help = 'A Mastodon account. This can be either a URL to the profile page or a string of the form @account@instance.example.org')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.account)
|
||||
|
||||
|
||||
class MastodonTootScraperMode(enum.Enum):
    '''Selects whether the toot scraper handles a single toot or a thread.'''

    SINGLE = 'single'
    THREAD = 'thread'

    @classmethod
    def _cli_from_args(cls, args):
        '''Map parsed CLI arguments to a mode: THREAD if args.thread is truthy, SINGLE otherwise.'''

        return cls.THREAD if args.thread else cls.SINGLE
|
||||
|
||||
|
||||
class MastodonTootScraper(_MastodonCommonScraper):
|
||||
name = 'mastodon-toot'
|
||||
|
||||
def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._url = url
|
||||
self._mode = mode
|
||||
|
||||
def get_items(self):
|
||||
r = self._rate_limited_get(self._url, headers = self._headers)
|
||||
if r.status_code == 404:
|
||||
_logger.warning('Toot does not exist')
|
||||
return
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
if self._mode is MastodonTootScraperMode.SINGLE:
|
||||
status = soup.find('div', class_ = 'detailed-status')
|
||||
entry = status.parent
|
||||
yield from self._entries_to_items([entry], r.url)
|
||||
elif self._mode is MastodonTootScraperMode.THREAD:
|
||||
yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('--thread', action = 'store_true', help = 'Collect thread around the toot referenced by the URL')
|
||||
subparser.add_argument('url', type = snscrape.base.nonempty_string('url'), help = 'A URL for a toot')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.url, mode = MastodonTootScraperMode._cli_from_args(args))
|
||||
285
snscrape/modules/reddit.py
Normal file
285
snscrape/modules/reddit.py
Normal file
@@ -0,0 +1,285 @@
|
||||
__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import snscrape.version
|
||||
import string
|
||||
import time
|
||||
import typing
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Submission(snscrape.base.Item):
|
||||
author: typing.Optional[str] # E.g. submission hf7k6
|
||||
date: datetime.datetime
|
||||
id: str
|
||||
link: typing.Optional[str]
|
||||
selftext: typing.Optional[str]
|
||||
subreddit: typing.Optional[str] # E.g. submission 617p51
|
||||
title: str
|
||||
url: str
|
||||
|
||||
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Comment(snscrape.base.Item):
|
||||
author: typing.Optional[str]
|
||||
body: str
|
||||
date: datetime.datetime
|
||||
id: str
|
||||
parentId: typing.Optional[str]
|
||||
subreddit: typing.Optional[str]
|
||||
url: str
|
||||
|
||||
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
def _cmp_id(id1, id2):
    '''Three-way comparison of two Reddit IDs: -1 if id1 is less than id2, 0 if they are equal, 1 if id1 is greater than id2.

    A shorter ID is always the smaller one; IDs of equal length are compared as strings.
    A type prefix like t1_ is optional, but if id1 has one, id2 must carry exactly the same prefix.

    Raises ValueError on a prefix mismatch or if an ID contains anything but lowercase ASCII letters and digits.'''

    if id1.startswith('t') and '_' in id1:
        prefix, id1 = id1.split('_', 1)
        if not id2.startswith(f'{prefix}_'):
            raise ValueError('id2 must have the same prefix as id1')
        id2 = id2.split('_', 1)[1]

    # Stripping every permitted character from both ends leaves something behind only if a forbidden character is present.
    permitted = string.ascii_lowercase + string.digits
    for label, value in (('id1', id1), ('id2', id2)):
        if value.strip(permitted) != '':
            raise ValueError(f'invalid characters in {label}')

    # Length first, then lexicographic order
    key1 = (len(id1), id1)
    key2 = (len(id2), id2)
    return (key1 > key2) - (key1 < key2)
|
||||
|
||||
|
||||
class _RedditPushshiftScraper(snscrape.base.Scraper):
    '''Shared base of the Reddit scrapers: request handling and conversion of API objects into items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}

    def _handle_rate_limiting(self, r):
        '''Response callback for _get; returns an (ok, reason) tuple.

        Only status 200 is OK. On a 429, this sleeps for 10 seconds before reporting the response as rate-limited.'''
        if r.status_code == 200:
            return True, None
        if r.status_code == 429:
            _logger.info('Got 429 response, sleeping')
            time.sleep(10)
            return False, 'rate-limited'
        return False, 'non-200 status code'

    def _get_api(self, url, params = None):
        '''Request url with the optional query params and return the decoded JSON body; raises ScraperException on a non-200 status.'''
        response = self._get(url, params = params, headers = self._headers, responseOkCallback = self._handle_rate_limiting)
        if response.status_code == 200:
            return response.json()
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')

    def _api_obj_to_item(self, d):
        '''Turn one API result dict into a Submission (if it has a 'title' key) or a Comment (otherwise).'''

        # Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all
        permalink = d.get('permalink')
        if permalink is None:
            # E.g. comment dovj2v7
            permalink = d.get('permalink_url')
        if permalink is None:
            if 'link_id' in d and d['link_id'].startswith('t3_'): # E.g. comment doraazf
                submissionId = d['link_id'][3:]
                if 'subreddit' in d:
                    permalink = f'/r/{d["subreddit"]}/comments/{submissionId}/_/{d["id"]}/'
                else: # E.g. submission 617p51 but can likely happen for comments as well
                    permalink = f'/comments/{submissionId}/_/{d["id"]}/'
            else:
                _logger.warning('Unable to find or construct permalink')
                permalink = '/'

        url = f'https://old.reddit.com{permalink}'
        common = {
            'author': d.get('author'),
            'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
            'url': url,
            'subreddit': d.get('subreddit'),
        }

        if 'title' not in d:
            return Comment(**common, body = d['body'], parentId = d.get('parent_id'), id = f't1_{d["id"]}')

        selftext = d.get('selftext') or None
        link = None
        if not selftext:
            # Relative links are resolved against old.reddit.com
            link = d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}'
        # A link that just leads back to the submission itself carries no information
        if link == url or url.replace('//old.reddit.com/', '//www.reddit.com/') == link:
            link = None
        return Submission(**common, selftext = selftext, link = link, title = d['title'], id = f't3_{d["id"]}')
|
||||
|
||||
|
||||
class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
    '''Base of the scrapers that page through the Pushshift search endpoints.

    Subclasses supply three class attributes: name ('reddit-<kind>'), _validationFunc (decides whether the search value is acceptable) and _apiField (the query parameter the value is sent as).'''

    def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
        super().__init__(**kwargs)
        self._name = name
        self._submissions = submissions
        self._comments = comments
        self._before = before
        self._after = after

        scraperCls = type(self)
        if not scraperCls._validationFunc(self._name):
            raise ValueError(f'invalid {scraperCls.name.split("-", 1)[1]} name')
        if not (self._submissions or self._comments):
            raise ValueError('At least one of submissions and comments must be True')

    def _iter_api(self, url, params = None):
        '''Iterate through the Pushshift API using the 'before' parameter and yield the items.

        params is modified in place, so callers must not share one dict between several iterators.'''
        if params is None:
            params = {}
        for key, bound in (('before', self._before), ('after', self._after)):
            if bound is not None:
                params[key] = bound
        params['sort'] = 'desc'

        lowestIdSeen = None
        while True:
            results = self._get_api(url, params = params)['data']
            if not results:
                break
            if lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in results):
                # end of pagination: this page holds nothing below the lowest ID already yielded
                break
            for d in results:
                if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
                    yield self._api_obj_to_item(d)
                    lowestIdSeen = d['id']
            # The next request starts at the timestamp of the last result on this page (plus one second)
            params['before'] = results[-1]['created_utc'] + 1

    def _iter_api_submissions_and_comments(self, params: dict):
        '''Yield submissions and/or comments (as selected in the constructor), interleaved into reverse-chronological order.'''
        params['size'] = '1000'
        # Each iterator gets its own copy of params because _iter_api modifies the dict it is given
        if self._submissions:
            submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy())
        else:
            submissionsIter = iter(())
        if self._comments:
            commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy())
        else:
            commentsIter = iter(())

        exhausted = object()

        tipSubmission = next(submissionsIter, exhausted)
        if tipSubmission is exhausted:
            # There are no submissions, just yield comments and return
            yield from commentsIter
            return
        tipComment = next(commentsIter, exhausted)
        if tipComment is exhausted:
            # There are no comments, just yield submissions and return
            yield tipSubmission
            yield from submissionsIter
            return

        while True:
            # Return newer first; if both have the same creation datetime, return the comment first
            if tipSubmission.date > tipComment.date:
                yield tipSubmission
                tipSubmission = next(submissionsIter, exhausted)
                if tipSubmission is exhausted:
                    # Reached the end of submissions, just yield the remaining comments and stop
                    yield tipComment
                    yield from commentsIter
                    return
            else:
                yield tipComment
                tipComment = next(commentsIter, exhausted)
                if tipComment is exhausted:
                    # Reached the end of comments, just yield the remaining submissions and stop
                    yield tipSubmission
                    yield from submissionsIter
                    return

    def get_items(self):
        '''Yield all matching items, using the search value as the subclass's _apiField parameter.'''
        searchParams = {type(self)._apiField: self._name}
        yield from self._iter_api_submissions_and_comments(searchParams)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the CLI options plus one positional argument named after the part of cls.name following 'reddit-'.'''
        for flag, dest, helpText in (
            ('--no-submissions', 'noSubmissions', "Don't list submissions"),
            ('--no-comments', 'noComments', "Don't list comments"),
        ):
            subparser.add_argument(flag, dest = dest, action = 'store_true', default = False, help = helpText)
        for flag, helpText in (
            ('--before', 'Fetch results before a Unix timestamp'),
            ('--after', 'Fetch results after a Unix timestamp'),
        ):
            subparser.add_argument(flag, metavar = 'TIMESTAMP', type = int, help = helpText)
        positional = cls.name.split('-', 1)[1]
        subparser.add_argument(positional, type = snscrape.base.nonempty_string(positional))

    @classmethod
    def _cli_from_args(cls, args):
        '''Construct the scraper from the arguments registered by _cli_setup_parser.'''
        positional = cls.name.split('-', 1)[1]
        return cls._cli_construct(
            args,
            getattr(args, positional),
            submissions = not args.noSubmissions,
            comments = not args.noComments,
            before = args.before,
            after = args.after,
        )
|
||||
|
||||
|
||||
class RedditUserScraper(_RedditPushshiftSearchScraper):
    '''Search scraper that sends its value as the 'author' parameter, i.e. lists a user's submissions and comments.'''

    name = 'reddit-user'
    # fullmatch rather than match with '^...$': '$' also matches right before a trailing newline, which would let a name like 'abc\n' pass validation
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9_-]{3,20}', x)
    _apiField = 'author'
|
||||
|
||||
|
||||
class RedditSubredditScraper(_RedditPushshiftSearchScraper):
    '''Search scraper that sends its value as the 'subreddit' parameter, i.e. lists a subreddit's submissions and comments.'''

    name = 'reddit-subreddit'
    # fullmatch rather than match with '^...$': '$' also matches right before a trailing newline, which would let a name like 'pics\n' pass validation
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9][A-Za-z0-9_]{2,20}', x)
    _apiField = 'subreddit'
|
||||
|
||||
|
||||
class RedditSearchScraper(_RedditPushshiftSearchScraper):
    '''Search scraper that sends its value as the 'q' parameter.'''

    name = 'reddit-search'
    # Any search term is accepted
    _validationFunc = lambda x: True
    _apiField = 'q'
|
||||
|
||||
|
||||
class RedditSubmissionScraper(_RedditPushshiftScraper):
    '''Scraper for one submission followed by its comments.'''

    name = 'reddit-submission'

    def __init__(self, submissionId, **kwargs):
        '''submissionId may be given with or without the t3_ prefix; apart from that prefix, only lowercase ASCII letters and digits are allowed (ValueError otherwise).'''
        bareId = submissionId[3:] if submissionId.startswith('t3_') else submissionId
        if bareId.strip(string.ascii_lowercase + string.digits) != '':
            raise ValueError('invalid submissionId')
        super().__init__(**kwargs)
        self._submissionId = submissionId

    def get_items(self):
        '''Yield the submission, then its comments; yields nothing if the submission is not found.

        Raises ScraperException if the lookup returns more than one submission.'''
        submissions = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')['data']
        if not submissions:
            return
        if len(submissions) != 1:
            raise snscrape.base.ScraperException(f'Got {len(submissions)} results instead of 1')
        yield self._api_obj_to_item(submissions[0])

        commentIds = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')['data']
        if not commentIds:
            return
        # Comments are requested in batches of 500 IDs
        batchSize = 500
        for start in range(0, len(commentIds), batchSize):
            batch = commentIds[start : start + batchSize]
            comments = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(batch)}')['data']
            for comment in comments:
                yield self._api_obj_to_item(comment)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the positional submissionId argument.'''
        subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId'))

    @classmethod
    def _cli_from_args(cls, args):
        '''Construct the scraper from the parsed submissionId argument.'''
        return cls._cli_construct(args, args.submissionId)
|
||||
339
snscrape/modules/telegram.py
Normal file
339
snscrape/modules/telegram.py
Normal file
@@ -0,0 +1,339 @@
|
||||
__all__ = ['LinkPreview', 'TelegramPost', 'Channel', 'TelegramChannelScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
|
||||
|
||||
@dataclasses.dataclass
class LinkPreview:
    '''The link preview box of a post. Only href is mandatory; the remaining fields are None when the preview lacks the respective element.'''

    href: str
    siteName: typing.Optional[str] = None
    title: typing.Optional[str] = None
    description: typing.Optional[str] = None
    image: typing.Optional[str] = None # URL of the preview image
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Channel(snscrape.base.Entity):
    '''A Telegram channel. Only username is mandatory; every other field is None when it is not known.'''

    username: str
    title: typing.Optional[str] = None
    verified: typing.Optional[bool] = None
    photo: typing.Optional[str] = None # URL of the channel picture
    description: typing.Optional[str] = None
    members: typing.Optional[int] = None
    # The following counters come with a granularity
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    videos: typing.Optional[snscrape.base.IntWithGranularity] = None
    links: typing.Optional[snscrape.base.IntWithGranularity] = None
    files: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated accessors for the granularity of the counters above
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
    linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
    filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')

    def __str__(self):
        '''A channel is represented by the URL of its public post list.'''
        return f'https://t.me/s/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
    '''A single post of a Telegram channel as produced by TelegramChannelScraper.

    List-valued fields are None rather than empty lists when there is nothing to report.'''

    url: str
    date: datetime.datetime
    content: typing.Optional[str] # None for posts without a text element
    outlinks: typing.Optional[typing.List[str]] = None
    mentions: typing.Optional[typing.List[str]] = None
    hashtags: typing.Optional[typing.List[str]] = None
    forwarded: typing.Optional['Channel'] = None
    forwardedUrl: typing.Optional[str] = None
    media: typing.Optional[typing.List['Medium']] = None
    views: typing.Optional[snscrape.base.IntWithGranularity] = None
    linkPreview: typing.Optional[LinkPreview] = None

    # Deprecated space-joined form of outlinks; an absent list yields an empty string instead of failing in str.join
    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks')

    def __str__(self):
        '''A post is represented by its URL.'''
        return self.url
|
||||
|
||||
|
||||
class Medium:
    '''Common base class of the media types that can be attached to a post.'''
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo(Medium):
    '''A photo attached to a post.'''

    url: str # Location of the image file
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video(Medium):
    '''A video attached to a post.'''

    thumbnailUrl: typing.Optional[str] # None when the player has no thumbnail element
    duration: float # In seconds
    url: typing.Optional[str] = None # None when the page does not embed the video file
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VoiceMessage(Medium):
    '''A voice message attached to a post.'''

    url: str # Location of the audio file
    duration: int # In seconds, as computed by _durationStrToSeconds
    bars: typing.List[float] # Numeric values taken from the style attributes of the player's bar elements
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Gif(Medium):
    '''An animation attached to a post: a video player without a duration.'''

    thumbnailUrl: typing.Optional[str] # None when the player has no thumbnail element
    url: typing.Optional[str] = None # None when the page does not embed the video file
|
||||
|
||||
|
||||
class TelegramChannelScraper(snscrape.base.Scraper):
    '''Scraper for the posts of a Telegram channel, read from the public web view at https://t.me/s/<name>.'''

    name = 'telegram-channel'

    def __init__(self, name, **kwargs):
        super().__init__(**kwargs)
        self._name = name
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        # Cache filled by _initial_page
        self._initialPage = None
        self._initialPageSoup = None

    def _initial_page(self):
        '''Return (response, soup) for the first page of the post list, fetching it on the first call only.

        Raises ScraperException if the page does not come back with status 200.'''
        if self._initialPage is None:
            r = self._get(f'https://t.me/s/{self._name}', headers = self._headers)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml')
        return self._initialPage, self._initialPageSoup

    def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
        '''Yield a TelegramPost for every post on the page, last post in the document first.

        pageUrl is the URL the page was retrieved from; relative links are resolved against it.
        With onlyUsername, yield just the channel name taken from the first post processed, then stop.'''
        posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
        for post in reversed(posts):
            if onlyUsername:
                yield post['data-post'].split('/')[0]
                return
            dateDiv = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date')
            rawUrl = dateDiv['href']
            if not rawUrl.startswith('https://t.me/') or sum(x == '/' for x in rawUrl) != 4 or rawUrl.rsplit('/', 1)[1].strip('0123456789') != '':
                _logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
            url = rawUrl.replace('//t.me/', '//t.me/s/')
            # Strip the separators from the timestamp so that a single strptime format, including the UTC offset, matches it
            date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
            media = []
            outlinks = []
            mentions = []
            hashtags = []
            forwarded = None
            forwardedUrl = None

            if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
                forwardedUrl = forwardTag['href']
                forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
                forwarded = Channel(username = forwardedName)

            if (message := post.find('div', class_ = 'tgme_widget_message_text')):
                content = message.get_text(separator="\n")
            else:
                content = None

            for link in post.find_all('a'):
                if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
                    # Author links at the top (avatar and name)
                    continue
                linkHref = link.get('href')
                if linkHref is None:
                    # An <a> without a target cannot be classified as anything below
                    continue
                if linkHref == rawUrl or linkHref == url:
                    style = link.attrs.get('style', '')
                    # Generic filter of links to the post itself, catches videos, photos, and the date link
                    if style != '':
                        imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                        if len(imageUrls) == 1:
                            media.append(Photo(url = imageUrls[0]))
                        continue
                    # Unstyled links to the post fall through; the outlink check below excludes rawUrl
                if _SINGLE_MEDIA_LINK_PATTERN.match(linkHref):
                    # Individual photo or video link
                    style = link.attrs.get('style', '')
                    imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                    if len(imageUrls) == 1:
                        media.append(Photo(url = imageUrls[0]))
                    continue
                if link.text.startswith('@'):
                    mentions.append(link.text.strip('@'))
                    continue
                if link.text.startswith('#'):
                    hashtags.append(link.text.strip('#'))
                    continue
                href = urllib.parse.urljoin(pageUrl, linkHref)
                if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
                    outlinks.append(href)

            for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
                audioUrl = voicePlayer.find('audio')['src']
                durationStr = voicePlayer.find('time').text
                duration = _durationStrToSeconds(durationStr)
                # The number is whatever follows the last ':' of each bar's style attribute, without ';' and '%'
                barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]

                media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))

            for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
                iTag = videoPlayer.find('i')
                if iTag is None:
                    videoUrl = None
                    videoThumbnailUrl = None
                else:
                    style = iTag['style']
                    videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
                    videoTag = videoPlayer.find('video')
                    videoUrl = None if videoTag is None else videoTag['src']
                mKwargs = {
                    'thumbnailUrl': videoThumbnailUrl,
                    'url': videoUrl,
                }
                # A player without a duration display is treated as a GIF
                timeTag = videoPlayer.find('time')
                if timeTag is None:
                    cls = Gif
                else:
                    cls = Video
                    mKwargs['duration'] = _durationStrToSeconds(timeTag.text)
                media.append(cls(**mKwargs))

            linkPreview = None
            if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
                kwargs = {}
                kwargs['href'] = urllib.parse.urljoin(pageUrl, linkPreviewA['href'])
                if (siteNameDiv := linkPreviewA.find('div', class_ = 'link_preview_site_name')):
                    kwargs['siteName'] = siteNameDiv.text
                if (titleDiv := linkPreviewA.find('div', class_ = 'link_preview_title')):
                    kwargs['title'] = titleDiv.text
                if (descriptionDiv := linkPreviewA.find('div', class_ = 'link_preview_description')):
                    kwargs['description'] = descriptionDiv.text
                if (imageI := linkPreviewA.find('i', class_ = 'link_preview_image')):
                    if imageI['style'].startswith("background-image:url('"):
                        # 22 is the length of the prefix checked above; the URL runs up to the closing quote
                        kwargs['image'] = imageI['style'][22 : imageI['style'].index("'", 22)]
                    else:
                        _logger.warning(f'Could not process link preview image on {url}')
                linkPreview = LinkPreview(**kwargs)
                # The previewed link is reported through linkPreview only
                if kwargs['href'] in outlinks:
                    outlinks.remove(kwargs['href'])

            viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
            views = None if viewsSpan is None else _parse_num(viewsSpan.text)

            # Empty lists are reported as None
            outlinks = outlinks if outlinks else None
            media = media if media else None
            mentions = mentions if mentions else None
            hashtags = hashtags if hashtags else None

            yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)

    def get_items(self):
        '''Yield the channel's posts page by page; yields nothing (after a warning) if the channel has no public post list.

        Raises ScraperException if a page cannot be retrieved with status 200.'''
        r, soup = self._initial_page()
        if '/s/' not in r.url:
            _logger.warning('No public post list for this user')
            return
        nextPageUrl = ''
        while True:
            yield from self._soup_to_items(soup, r.url)
            firstDateLink = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
            if firstDateLink is not None and firstDateLink['href'].split('/')[-1] == '1':
                # if message 1 is the first message in the page, terminate scraping
                break
            pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
            if not pageLink:
                # some pages are missing a "tme_messages_more" tag, causing early termination
                if '=' not in nextPageUrl:
                    canonicalLink = soup.find('link', attrs = {'rel': 'canonical'}, href = True)
                    if canonicalLink is None:
                        # Nothing to derive the next page from
                        break
                    nextPageUrl = canonicalLink['href']
                try:
                    nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
                except ValueError:
                    # The URL carries no numeric post index (e.g. a canonical URL without a query string), so there is no earlier page to compute
                    break
                if nextPostIndex > 20:
                    pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
                else:
                    break
            nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
            r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            soup = bs4.BeautifulSoup(r.text, 'lxml')

    def _get_entity(self):
        '''Return a Channel describing this channel, or None if the channel has no public post list.

        Raises ScraperException if a page cannot be retrieved with status 200.'''
        kwargs = {}
        # /channel has a more accurate member count and bigger profile picture
        r = self._get(f'https://t.me/{self._name}', headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        membersDiv = soup.find('div', class_ = 'tgme_page_extra')
        if membersDiv is not None:
            membersText = membersDiv.text.split(',')[0]
            if membersText.endswith((' members', ' subscribers')):
                # Drop the trailing word and the spaces inside the number
                membersStr = ''.join(membersText.split(' ')[:-1])
                if membersStr == 'no':
                    kwargs['members'] = 0
                else:
                    kwargs['members'] = int(membersStr)
        photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
        if photoImg is not None:
            kwargs['photo'] = photoImg.attrs['src']
        else:
            kwargs['photo'] = None

        r, soup = self._initial_page()
        if '/s/' not in r.url: # Redirect on channels without public posts
            return
        channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
        assert channelInfoDiv, 'channel info div not found'
        titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
        kwargs['title'] = titleDiv.find('span').text
        kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon'))
        # The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere.
        # However, the post URLs are, so extract the first post and use that.
        try:
            kwargs['username'] = next(self._soup_to_items(soup, r.url, onlyUsername = True))
        except StopIteration:
            # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
            _logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
            kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
        if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
            kwargs['description'] = descriptionDiv.text

        for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
            value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
            type_ = div.find('span', class_ = 'counter_type').text
            if type_ == 'members':
                # Already extracted more accurately from /channel, skip
                continue
            elif type_ in ('photos', 'videos', 'links', 'files'):
                kwargs[type_] = snscrape.base.IntWithGranularity(value, granularity)

        return Channel(**kwargs)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the positional channel argument.'''
        subparser.add_argument('channel', type = snscrape.base.nonempty_string('channel'), help = 'A channel name')

    @classmethod
    def _cli_from_args(cls, args):
        '''Construct the scraper from the parsed channel argument.'''
        return cls._cli_construct(args, args.channel)
|
||||
|
||||
def _parse_num(s):
    '''Parse a counter string such as '123', '1 234', '1.2K' or '3.45M' into a (value, granularity) tuple.

    granularity is the place value of the last digit shown: 1 for plain integers, 100 for '1.2K', 10000 for '3.45M'.
    Raises ValueError if the numeric part cannot be parsed.'''
    s = s.replace(' ', '')
    for suffix, exponent in (('M', 6), ('K', 3)):
        if s.endswith(suffix):
            number = s[:-1]
            decimals = len(number.split('.')[1]) if '.' in number else 0
            # round() instead of int(): the float product can land just below the exact result
            # (1.005 * 1000 == 1004.9999999999999), which truncation would turn into an off-by-one.
            return round(float(number) * 10 ** exponent), 10 ** (exponent - decimals)
    return int(s), 1
|
||||
|
||||
def _durationStrToSeconds(durationStr):
    '''Convert a duration written as 'S', 'M:S' or 'H:M:S' into a number of seconds.

    Only the last three colon-separated fields are taken into account.'''
    seconds = 0
    # Pair the fields, rightmost first, with the number of seconds each unit is worth
    for unit, field in zip((1, 60, 3600), reversed(durationStr.split(':'))):
        seconds += unit * int(field)
    return seconds
|
||||
|
||||
def _telegramResponseOkCallback(r):
    '''Response callback returning (ok, reason): a response is OK only with status 200; otherwise the reason names the status code.'''
    if r.status_code != 200:
        return (False, f'{r.status_code=}')
    return (True, None)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
406
snscrape/modules/vkontakte.py
Normal file
406
snscrape/modules/vkontakte.py
Normal file
@@ -0,0 +1,406 @@
|
||||
__all__ = ['VKontaktePost', 'Photo', 'PhotoVariant', 'Video', 'User', 'VKontakteUserScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import collections
|
||||
import dataclasses
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
# Timezone helpers with one interface for both backends: _timezone(name) returns a tzinfo object,
# _localised_datetime(tz, ...) builds a datetime in that timezone from the usual datetime arguments.
try:
    import zoneinfo
except ImportError:
    # Python 3.8 support; nowadays, Europe/Moscow is always UTC+3, but it's more complicated before 2014, so need proper zone info
    import pytz

    def _timezone(s):
        '''Return the pytz timezone named s.'''
        return pytz.timezone(s)

    def _localised_datetime(tz, *args, **kwargs):
        '''Build the naive datetime from the arguments and attach tz through its localize method.'''
        naive = datetime.datetime(*args, **kwargs)
        return tz.localize(naive)
else:
    def _timezone(s):
        '''Return the zoneinfo timezone named s.'''
        return zoneinfo.ZoneInfo(s)

    def _localised_datetime(tz, *args, **kwargs):
        '''Build the datetime from the arguments with tz as its tzinfo.'''
        return datetime.datetime(*args, tzinfo = tz, **kwargs)
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
_datePattern = re.compile(r'^(?P<date>today'
|
||||
r'|yesterday'
|
||||
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
|
||||
r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
|
||||
')'
|
||||
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''A VKontakte user. username, name and verified are mandatory; the other fields are None when not known.'''

    username: str
    name: str
    verified: bool
    description: typing.Optional[str] = None
    websites: typing.Optional[typing.List[str]] = None
    # The following counters come with a granularity
    followers: typing.Optional[snscrape.base.IntWithGranularity] = None
    posts: typing.Optional[snscrape.base.IntWithGranularity] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    tags: typing.Optional[snscrape.base.IntWithGranularity] = None
    following: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated accessors for the granularity of the counters above
    followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
    postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
    followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')

    def __str__(self):
        '''A user is represented by the URL of their page.'''
        return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
    '''A post on a VKontakte wall.'''

    url: str
    date: typing.Optional[typing.Union[datetime.datetime, datetime.date]] # A plain date when no time of day is available; None when unknown
    content: str
    user: User
    outlinks: typing.Optional[typing.List[str]] = None
    photos: typing.Optional[typing.List['Photo']] = None
    video: typing.Optional['Video'] = None
    quotedPost: typing.Optional['VKontaktePost'] = None

    def __str__(self):
        '''A post is represented by its URL.'''
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo:
    '''A photo attached to a post, available in one or more variants.'''

    variants: typing.List['PhotoVariant']
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PhotoVariant:
    '''One rendition of a photo: the image URL and its dimensions.'''

    url: str
    width: int
    height: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video:
    '''A video attached to a post.'''

    id: str # Value of the thumbnail's data-video attribute
    list: str # Value of the thumbnail's data-list attribute
    duration: int # Value of the thumbnail's data-duration attribute
    url: str
    thumbUrl: str
|
||||
|
||||
|
||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
name = 'vkontakte-user'
|
||||
|
||||
def __init__(self, username, **kwargs):
    '''Set up a scraper for the page at https://vk.com/<username>; the remaining keyword arguments go to the base scraper.'''
    super().__init__(**kwargs)
    self._username = username
    self._baseUrl = f'https://vk.com/{self._username}'
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Nothing has been fetched yet
    self._initialPage = self._initialPageSoup = None
|
||||
|
||||
def _away_a_to_url(self, a):
    '''Transform an <a> tag with an href of /away.php?to=... to a plain URL; returns None if a doesn't have that form.'''
    marker = '/away.php?to='
    if not a or not a.get('href', '').startswith(marker):
        return None
    # The target is everything after the marker up to the next '&' (or the end of the href), percent-decoded.
    target, _, _ = a['href'][len(marker):].partition('&')
    return urllib.parse.unquote(target)
|
||||
|
||||
def _date_span_to_date(self, dateSpan):
    '''Extract the date of a post from its date element.

    Returns an aware datetime if a 'time' attribute or a date with a time of day is present, a plain date for date-only texts, and None otherwise (a warning is logged unless the text is 'video' or 'photo').'''
    if not dateSpan:
        return None
    if 'time' in dateSpan.attrs:
        # Interpreted as a Unix timestamp
        return datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc)

    text = dateSpan.text

    match = _datePattern.match(text)
    if match:
        # Datetime information down to minutes
        tz = _timezone('Europe/Moscow')
        dayPart = match.group('date')
        if dayPart in ('today', 'yesterday'):
            # Relative dates are resolved against the current time in tz
            reference = datetime.datetime.now(tz = tz)
            if dayPart == 'yesterday':
                reference -= datetime.timedelta(days = 1)
            year, month, day = reference.year, reference.month, reference.day
        else:
            # A date without a year is taken to be in the current year
            year = int(match.group('year1') or match.group('year2') or datetime.datetime.now(tz = tz).year)
            month = _months.index(match.group('month1') or match.group('month2')) + 1
            day = int(match.group('day1') or match.group('day2'))
        # 12-hour to 24-hour clock: 12 am is hour 0 and 12 pm is hour 12
        hour = int(match.group('hour'))
        if hour == 12:
            hour = 0
        if match.group('ampm') == 'pm':
            hour += 12
        minute = int(match.group('minute'))
        return _localised_datetime(tz, year, month, day, hour, minute)

    match = re.match(r'^(?P<day>\d+)\s+(?P<month>' + '|'.join(_months) + r')\s+(?P<year>\d{4})$', text)
    if match:
        # Date only
        return datetime.date(int(match.group('year')), _months.index(match.group('month')) + 1, int(match.group('day')))

    if text not in ('video', 'photo'): # Silently ignore video and photo reposts which have no original date attached
        _logger.warning(f'Could not parse date string: {text!r}')
|
||||
|
||||
def _post_div_to_item(self, post, isCopy = False):
    """Convert a post element into a VKontaktePost.

    post is the element to parse; isCopy selects the class names used for the link, date, and thumbnail lookups of a quoted ('copy_quote') post, which this method processes by calling itself recursively.
    Returns None, after logging a warning, if the expected post link is not found.
    """
    postLink = post.find('a', class_ = 'post_link' if not isCopy else 'published_by_date')
    if not postLink:
        _logger.warning(f'Skipping post without link: {str(post)[:200]!r}')
        return
    url = urllib.parse.urljoin(self._baseUrl, postLink['href'])
    # Sanity check on the URL shape: a wall URL (or, for quoted posts, also a video or photo URL) containing an underscore followed by digits, optionally with a trailing '?reply=' part.
    assert (url.startswith('https://vk.com/wall') or (isCopy and (url.startswith('https://vk.com/video') or url.startswith('https://vk.com/photo')))) and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') in ('', '?reply=')
    # The date lives in a different element depending on whether this is a regular or a quoted post.
    if not isCopy:
        dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
    else:
        dateSpan = post.find('div', class_ = 'copy_post_date').find('a', class_ = 'published_by_date')
    textDiv = post.find('div', class_ = 'wall_post_text')
    # Outlinks are the redirect links inside the post text, decoded to their targets.
    outlinks = [h for a in textDiv.find_all('a') if (h := self._away_a_to_url(a))] if textDiv else []
    # A media link's target is added as well unless it is already among the text links.
    if (mediaLinkDiv := post.find('div', class_ = 'media_link')) and \
       (mediaLinkA := mediaLinkDiv.find('a', class_ = 'media_link__title')) and \
       (href := self._away_a_to_url(mediaLinkA)) and \
       href not in outlinks:
        outlinks.append(href)
    photos = None
    video = None
    if (thumbsDiv := (post.find('div', class_ = 'wall_text') if not isCopy else post).find('div', class_ = 'page_post_sized_thumbs')) and \
       not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
        photos = []
        for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
            if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
                _logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
                continue
            if 'data-video' in a.attrs:
                # Video
                # Note that each video thumb overwrites `video`, so only the last one found is kept.
                video = Video(
                    id = a['data-video'],
                    list = a['data-list'],
                    duration = int(a['data-duration']),
                    url = f'https://vk.com{a["href"]}',
                    # The thumbnail URL is cut out of the inline style; 22 = len('background-image: url(')
                    thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
                )
                continue
            # From here on: photo
            if 'onclick' not in a.attrs or not a['onclick'].startswith("return showPhoto('") or '{"temp":' not in a['onclick'] or not a['onclick'].endswith('}, event)'):
                _logger.warning(f'Photo thumb wrap on {url} has no or unexpected onclick, skipping')
                continue
            # The photo metadata is the JSON object embedded in the onclick handler, from '{"temp":' up to the closing brace.
            photoData = a['onclick'][a['onclick'].find('{"temp":') : -8] # -8 = len(', event)')
            photoObj = json.loads(photoData)
            # Each single lowercase letter key names one size variant; the matching '<letter>_' key holds a [url, width, height] list (validated below).
            singleLetterKeys = [k for k in photoObj['temp'].keys() if len(k) == 1 and 97 <= ord(k) <= 122] # 97 = ord('a'), 122 = ord('z')
            for x in singleLetterKeys:
                # Merge base into URLs
                if not photoObj['temp'][x].startswith('https://'):
                    photoObj['temp'][x] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x]}'
                x_ = f'{x}_'
                if not photoObj['temp'][x_][0].startswith('https://'):
                    photoObj['temp'][x_][0] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x_][0]}'
            # Validate the structure: only known keys, consistent URLs between '<letter>' and '<letter>_', the expected host pattern, and integer dimensions.
            if any(k not in {'base', 'w', 'w_', 'x', 'x_', 'y', 'y_', 'z', 'z_'} for k in photoObj['temp'].keys()) or \
               not all(photoObj['temp'][x] in (photoObj['temp'][f'{x}_'][0], photoObj['temp'][f'{x}_'][0] + '.jpg') for x in singleLetterKeys) or \
               not all(photoObj['temp'][x].startswith('https://sun') and '.userapi.com/' in photoObj['temp'][x] for x in singleLetterKeys) or \
               not all(len(photoObj['temp'][(x_ := f'{x}_')]) == 3 and isinstance(photoObj['temp'][x_][1], int) and isinstance(photoObj['temp'][x_][2], int) for x in singleLetterKeys):
                _logger.warning(f'Photo thumb wrap on {url} has unexpected data structure, skipping')
                continue
            photoVariants = []
            for x in singleLetterKeys:
                x_ = f'{x}_'
                # '.jpg' is appended to variant URLs that do not already contain it.
                photoVariants.append(PhotoVariant(url = f'{photoObj["temp"][x_][0]}.jpg' if '.jpg' not in photoObj['temp'][x_][0] else photoObj['temp'][x_][0], width = photoObj['temp'][x_][1], height = photoObj['temp'][x_][2]))
            # The photo page URL is only set if the href is '/photo' followed solely by digits, '-' and '_'.
            photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
            photos.append(Photo(variants = photoVariants, url = photoUrl))
    # A quoted post inside this post is parsed recursively with the quoted-post class names.
    quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
    authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
    authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
    # The username is the last path segment of the author link.
    username = authorLink['href'].split('/')[-1]
    name = authorLink.text
    if authorHeading.find('div', class_ = 'page_verified') is not None:
        verified = True
    else:
        verified = False
    user = User(username = username, name = name, verified = verified)
    # Empty collections are reported as None rather than as empty lists.
    return VKontaktePost(
        url = url,
        date = self._date_span_to_date(dateSpan),
        content = textDiv.text if textDiv else None,
        user = user,
        outlinks = outlinks or None,
        photos = photos or None,
        video = video or None,
        quotedPost = quotedPost,
    )
|
||||
|
||||
def _soup_to_items(self, soup):
|
||||
for post in soup.find_all('div', class_ = 'post'):
|
||||
yield self._post_div_to_item(post)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
||||
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
    """Yield the posts of the wall, starting with those on the initial page and then paginating in steps of 10.

    Yields nothing (after logging) if the wall does not exist, the profile is private, the page carries a 'profile_deleted_text' heading, or the wall has no posts.
    Raises snscrape.base.ScraperException on an unrecognised pagination response.
    """
    r, soup = self._initial_page()
    if r.status_code == 404:
        _logger.warning('Wall does not exist')
        return

    if soup.find('div', class_ = 'profile_closed_wall_dummy'):
        _logger.warning('Private profile')
        return

    if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
        # Unclear what this state represents, so just log website text.
        _logger.warning(profileDeleted.text)
        return

    newestPost = soup.find('div', class_ = 'post')
    if not newestPost:
        _logger.info('Wall has no posts')
        return
    # The owner ID is the part of 'data-post-id' before the underscore.
    ownerID = newestPost.attrs['data-post-id'].split('_')[0]
    # If there is a pinned post, we need its ID for the pagination requests
    if 'post_fixed' in newestPost.attrs['class']:
        fixedPostID = int(newestPost.attrs['id'].split('_')[1])
    else:
        fixedPostID = ''

    # IDs of the most recently yielded posts, used to suppress duplicates across overlapping pages.
    last1000PostIDs = collections.deque(maxlen = 1000)

    def _process_soup(soup):
        # Yield the items of one page, skipping any whose post ID was yielded recently.
        nonlocal last1000PostIDs
        for item in self._soup_to_items(soup):
            # The post ID is the numeric part after the last underscore of the URL.
            postID = int(item.url.rsplit('_', 1)[1])
            if postID not in last1000PostIDs:
                yield item
                last1000PostIDs.append(postID)

    yield from _process_soup(soup)

    # Offset of the last pagination request that returned posts; the geoblock workaround restarts just after it.
    lastWorkingOffset = 0
    for offset in itertools.count(start = 10, step = 10):
        posts = self._get_wall_offset(fixedPostID, ownerID, offset)
        if posts.startswith('<div class="page_block no_posts">'):
            # Reached the end
            break
        if not posts.startswith('<div id="post'):
            if posts == '"\\/blank.php?block=119910902"':
                _logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
                # Step through the affected range one offset at a time and collect whatever is returned.
                for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
                    geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
                    if geoPosts.startswith('<div class="page_block no_posts">'):
                        # No breaking the outer loop, it'll just make one extra request and exit as well
                        break
                    if not geoPosts.startswith('<div id="post'):
                        if geoPosts == '"\\/blank.php?block=119910902"':
                            continue
                        raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
                    yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
                continue
            raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
        lastWorkingOffset = offset
        soup = bs4.BeautifulSoup(posts, 'lxml')
        yield from _process_soup(soup)
|
||||
|
||||
def _get_wall_offset(self, fixedPostID, ownerID, offset):
    """Request one page of wall posts at ``offset`` and return its HTML payload as a string.

    Raises snscrape.base.ScraperException if the response status code is not 200.
    """
    # The shared headers stay untouched; the extra header is only added to this request's copy.
    requestHeaders = {**self._headers, 'X-Requested-With': 'XMLHttpRequest'}
    _logger.info(f'Retrieving page offset {offset}')
    formData = [
        ('act', 'get_wall'),
        ('al', 1),
        ('fixed', fixedPostID),
        ('offset', offset),
        ('onlyCache', 'false'),
        ('owner_id', ownerID),
        ('type', 'own'),
        ('wall_start_from', offset),
    ]
    response = self._post('https://vk.com/al_wall.php', data = formData, headers = requestHeaders)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
    return response.json()['payload'][1][0]
|
||||
|
||||
def _get_entity(self):
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
kwargs = {}
|
||||
kwargs['username'] = r.url.rsplit('/', 1)[1]
|
||||
nameH1 = soup.find('h1', class_ = 'page_name')
|
||||
kwargs['name'] = nameH1.text
|
||||
kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
|
||||
|
||||
if (descriptionDiv := soup.find('div', id = 'page_current_info')):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
|
||||
if (infoDiv := soup.find('div', id = 'page_info_wrap')):
|
||||
websites = []
|
||||
for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
|
||||
if 'profile_info_row' in rowDiv['class']:
|
||||
labelDiv = rowDiv.find('div', class_ = 'fl_l')
|
||||
if not labelDiv or labelDiv.text != 'Website:':
|
||||
continue
|
||||
else: # group_info_row
|
||||
if rowDiv['title'] == 'Description':
|
||||
kwargs['description'] = rowDiv.text
|
||||
if rowDiv['title'] != 'Website':
|
||||
continue
|
||||
for a in rowDiv.find_all('a'):
|
||||
if not a['href'].startswith('/away.php?to='):
|
||||
_logger.warning(f'Skipping odd website link: {a["href"]!r}')
|
||||
continue
|
||||
websites.append(urllib.parse.unquote(a['href'].split('=', 1)[1].split('&', 1)[0]))
|
||||
if websites:
|
||||
kwargs['websites'] = websites
|
||||
|
||||
def parse_num(s: str) -> typing.Tuple[int, int]:
|
||||
if s.endswith('K'):
|
||||
return int(s[:-1]) * 1000, 1000
|
||||
elif s.endswith('M'):
|
||||
baseNum = s[:-1]
|
||||
precision = 1000000
|
||||
if '.' in s:
|
||||
precision //= (10 ** len(baseNum.split('.')[1]))
|
||||
return int(float(baseNum) * 1000000), precision
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
if (countsDiv := soup.find('div', class_ = 'counts_module')):
|
||||
for a in countsDiv.find_all('a', class_ = 'page_counter'):
|
||||
count, granularity = parse_num(a.find('div', class_ = 'count').text)
|
||||
label = a.find('div', class_ = 'label').text
|
||||
if label in ('follower', 'post', 'photo', 'tag'):
|
||||
label = f'{label}s'
|
||||
if label in ('followers', 'posts', 'photos', 'tags'):
|
||||
kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)
|
||||
|
||||
if (idolsDiv := soup.find('div', id = 'profile_idols')):
|
||||
if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
|
||||
kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
|
||||
# On public pages, this is where followers are listed
|
||||
if (followersDiv := soup.find('div', id = 'public_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
# On community groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
|
||||
# On public groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', id = 'group_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
@classmethod
def _cli_setup_parser(cls, subparser):
    """Register this scraper's positional ``username`` argument on ``subparser``."""
    usernameType = snscrape.base.nonempty_string('username')
    subparser.add_argument('username', type = usernameType, help = 'A VK username')
|
||||
|
||||
@classmethod
def _cli_from_args(cls, args):
    """Construct the scraper from parsed CLI arguments, passing the username positionally."""
    username = args.username
    return cls._cli_construct(args, username)
|
||||
151
snscrape/modules/weibo.py
Normal file
151
snscrape/modules/weibo.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# Names exported by a star-import of this module.
__all__ = ['Post', 'User', 'WeiboUserScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
# Sentinel stored in place of the user once username resolution reports that the user does not exist.
_userDoesNotExist = object()
# Matches anything shaped like an HTML tag; used to strip markup from post text when no raw text is available.
_HTML_STRIP_PATTERN = re.compile(r'<[^>]*>')
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Post(snscrape.base.Item):
    """A single Weibo post as produced by WeiboUserScraper; its string form is the post URL."""

    url: str
    id: str
    user: typing.Optional['User']
    createdAt: str # Can have a variety of inconsistent formats
    text: str
    repostsCount: typing.Optional[int]
    commentsCount: typing.Optional[typing.Union[int, str]]
    likesCount: typing.Optional[int]
    picturesCount: typing.Optional[int]
    pictures: typing.Optional[typing.List[str]] # May be shorter than picturesCount if the API didn't return all of them (e.g. post Ipay2evb0)
    video: typing.Optional[str]
    link: typing.Optional[str]
    repostedPost: typing.Optional['Post']

    def __str__(self):
        # The URL serves as the textual representation of a post.
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Entity):
    """A Weibo user profile; its string form is the m.weibo.cn profile URL built from the uid."""

    screenname: str
    uid: int
    verified: bool
    verifiedReason: typing.Optional[str]
    description: str
    statusesCount: int
    followersCount: int
    followCount: int
    avatar: str

    def __str__(self):
        # The profile URL is derived from the numeric uid, not from the screen name.
        return f'https://m.weibo.cn/u/{self.uid}'
|
||||
|
||||
|
||||
class WeiboUserScraper(snscrape.base.Scraper):
    """Scraper for the posts and profile of one Weibo user, using the m.weibo.cn endpoints."""

    name = 'weibo-user'

    def __init__(self, user, **kwargs):
        """Create the scraper.

        user is treated as a numeric user ID if it is an int; any other value is treated as a username that is resolved to an ID on first use. Remaining keyword arguments go to the base scraper.
        """
        super().__init__(**kwargs)
        self._user = user
        self._isUserId = isinstance(user, int)
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}

    def _ensure_user_id(self):
        """Resolve a username to a numeric user ID, once.

        Afterwards ``self._user`` is either the int user ID or the ``_userDoesNotExist`` sentinel.
        Raises snscrape.base.ScraperException on an unexpected response.
        """
        if self._isUserId or self._user is _userDoesNotExist:
            # Already resolved. The sentinel check matters because _isUserId stays False for a nonexistent user; without it, a second call would put the sentinel object's repr into the request URL.
            return
        r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
        # A successful lookup is a redirect to '/u/' followed by exactly ten digits (13 characters in total).
        if r.status_code == 302 and r.headers['Location'].startswith('/u/') and len(r.headers['Location']) == 13 and r.headers['Location'][3:].strip('0123456789') == '':
            # Redirect to uid URL
            self._user = int(r.headers['Location'][3:])
            self._isUserId = True
        elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
            _logger.warning('User does not exist')
            self._user = _userDoesNotExist
        else:
            raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')

    def _check_timeline_response(self, r):
        """Response callback for timeline requests; returns ``(ok, reason)`` where reason is None for an accepted response."""
        if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
            # 'No content here yet'. Appears to happen sometimes on pagination, possibly due to too fast requests; retry this
            return False, 'no-content message'
        if r.status_code != 200:
            return False, 'non-200 status code'
        return True, None

    def _mblog_to_item(self, mblog):
        """Convert one 'mblog' dict from the API into a Post, recursing into 'retweeted_status' if present."""
        return Post(
            url = f'https://m.weibo.cn/status/{mblog["bid"]}',
            id = mblog['id'],
            user = self._user_info_to_entity(mblog['user']) if mblog['user'] is not None else None,
            createdAt = mblog['created_at'],
            # Prefer the raw text; otherwise strip tag-like markup from the HTML text.
            text = mblog['raw_text'] if 'raw_text' in mblog else _HTML_STRIP_PATTERN.sub('', mblog['text']),
            repostsCount = mblog.get('reposts_count'),
            commentsCount = mblog.get('comments_count'),
            likesCount = mblog.get('attitudes_count'),
            picturesCount = mblog.get('pic_num'),
            pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None,
            video = mblog['page_info']['media_info']['mp4_720p_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None,
            link = mblog['page_info']['page_url'] if 'page_info' in mblog and mblog['page_info']['type'] == 'webpage' else None,
            repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None,
        )

    def get_items(self):
        """Yield the user's posts page by page, following 'since_id' until the API stops returning one.

        Yields nothing if the user does not exist. Raises snscrape.base.ScraperException on a non-200 status code.
        """
        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        sinceId = None
        while True:
            sinceParam = f'&since_id={sinceId}' if sinceId is not None else ''
            r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            o = r.json()
            for card in o['data']['cards']:
                # Only cards of type 9 are converted to posts; everything else is skipped with a warning.
                if card['card_type'] != 9:
                    _logger.warning(f'Skipping card of type {card["card_type"]}')
                    continue
                yield self._mblog_to_item(card['mblog'])
            if 'since_id' not in o['data']['cardlistInfo']:
                # End of pagination
                break
            sinceId = o['data']['cardlistInfo']['since_id']

    def _user_info_to_entity(self, userInfo):
        """Convert a user-info dict from the API into a User."""
        return User(
            screenname = userInfo['screen_name'],
            uid = userInfo['id'],
            verified = userInfo['verified'],
            verifiedReason = userInfo.get('verified_reason'),
            description = userInfo['description'],
            statusesCount = userInfo['statuses_count'],
            followersCount = userInfo['followers_count'],
            followCount = userInfo['follow_count'],
            avatar = userInfo['avatar_hd'],
        )

    def _get_entity(self):
        """Fetch the user's profile and return it as a User, or None if the user does not exist.

        Raises snscrape.base.ScraperException if the profile request does not return status 200.
        """
        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException('Could not fetch user info')
        o = r.json()
        return self._user_info_to_entity(o['data']['userInfo'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        """Register the ``--name`` flag and the positional ``user`` argument on ``subparser``."""
        subparser.add_argument('--name', dest = 'isName', action = 'store_true', help = 'Use username instead of user ID')
        subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user ID')

    @classmethod
    def _cli_from_args(cls, args):
        """Construct the scraper from parsed CLI arguments; the user is converted to int unless ``--name`` was given."""
        return cls._cli_construct(args, user = args.user if args.isName else int(args.user))
|
||||
7
snscrape/version.py
Normal file
7
snscrape/version.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import importlib.metadata


# Expose the version of the installed 'snscrape' distribution; None if no metadata for it can be found.
try:
    __version__ = importlib.metadata.version('snscrape')
except importlib.metadata.PackageNotFoundError:
    __version__ = None
|
||||
Reference in New Issue
Block a user