mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 18:48:28 +03:00
Compare commits
254 Commits
v0.3.0
...
add-vk-use
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb429909d0 | ||
|
|
0822a9c354 | ||
|
|
faeffe2603 | ||
|
|
e3bdc02a7c | ||
|
|
e2d922301e | ||
|
|
b13e62eb5d | ||
|
|
f38513503d | ||
|
|
0a4bd39ca6 | ||
|
|
c18ca0f047 | ||
|
|
5648e957d0 | ||
|
|
21f7b620ec | ||
|
|
9b3faec980 | ||
|
|
97d38e5cde | ||
|
|
b276c3cc27 | ||
|
|
1e4e0c278d | ||
|
|
babcddda19 | ||
|
|
ed3ea944d1 | ||
|
|
e7a6d38a5f | ||
|
|
6c50eee31b | ||
|
|
5103a33afa | ||
|
|
247bd82d79 | ||
|
|
5fc67f2bcf | ||
|
|
65e7d8bd24 | ||
|
|
3870282a42 | ||
|
|
7c0fcdec43 | ||
|
|
9af1f19034 | ||
|
|
5fc3c0e290 | ||
|
|
f978954bb3 | ||
|
|
2ce014ade4 | ||
|
|
5d156c6a15 | ||
|
|
4e59638e7c | ||
|
|
a7eb54d226 | ||
|
|
d32c9add8a | ||
|
|
fb8d73ac95 | ||
|
|
ed829163a0 | ||
|
|
694657ef80 | ||
|
|
1ab0f4fccb | ||
|
|
3a92b5bf0d | ||
|
|
2480b173f4 | ||
|
|
de4ebed81f | ||
|
|
72b26f2373 | ||
|
|
77bbb9f61f | ||
|
|
57a624c618 | ||
|
|
b1cfd51121 | ||
|
|
ace2c16f54 | ||
|
|
2f9c0457df | ||
|
|
878f2a3c7a | ||
|
|
25ee014e29 | ||
|
|
a192dc6236 | ||
|
|
a7242f340b | ||
|
|
359cc25cdf | ||
|
|
01799a7391 | ||
|
|
b0753c34ed | ||
|
|
7f78fa0bc0 | ||
|
|
8702a9c7e2 | ||
|
|
8ac1fd3ea8 | ||
|
|
9235890f9a | ||
|
|
7d939c110c | ||
|
|
8e95e9a9a7 | ||
|
|
aa7d7d3dc3 | ||
|
|
560c78c5cf | ||
|
|
107c3c71c2 | ||
|
|
7f88678253 | ||
|
|
52e4f9fb69 | ||
|
|
eebdfc1c55 | ||
|
|
e6076353c8 | ||
|
|
a32d79fab2 | ||
|
|
65391297f6 | ||
|
|
deb2659dd6 | ||
|
|
93e62744d7 | ||
|
|
3f3632d341 | ||
|
|
5070953feb | ||
|
|
853848ed5d | ||
|
|
0b4abdc43f | ||
|
|
267b7d0e32 | ||
|
|
acb7f10a4f | ||
|
|
ca00b480b1 | ||
|
|
f189ab4241 | ||
|
|
c6e1e33a23 | ||
|
|
a37ea528d3 | ||
|
|
eee06d8593 | ||
|
|
4dd3ee6e47 | ||
|
|
0336ce13ed | ||
|
|
193d4f80d6 | ||
|
|
e7d35ec1eb | ||
|
|
8540045658 | ||
|
|
1f1c1bd8af | ||
|
|
7fdc8bcb53 | ||
|
|
4b3c6aefe7 | ||
|
|
525cd71225 | ||
|
|
72abff9e5c | ||
|
|
bcaa477b3d | ||
|
|
66d4c99f82 | ||
|
|
0ac50f1383 | ||
|
|
c2257ad16e | ||
|
|
58f654405f | ||
|
|
35fb61a327 | ||
|
|
a6b6f3faaa | ||
|
|
5e829e2541 | ||
|
|
d4567da23c | ||
|
|
e5e0da25a0 | ||
|
|
821326bcfb | ||
|
|
4bf9ef239c | ||
|
|
e382891642 | ||
|
|
e5f4389464 | ||
|
|
d91f971f51 | ||
|
|
67e8295293 | ||
|
|
5fc2562642 | ||
|
|
2825bd0a73 | ||
|
|
9831f2a4a0 | ||
|
|
a11eef6b06 | ||
|
|
3fb731ade1 | ||
|
|
c76f1637ce | ||
|
|
ed117e8891 | ||
|
|
f9a3fafb3f | ||
|
|
660b8c7a0a | ||
|
|
0c22608dc7 | ||
|
|
2bb706feda | ||
|
|
5e6bc4ec50 | ||
|
|
57d0aaafc1 | ||
|
|
157e4d4265 | ||
|
|
54588e9c42 | ||
|
|
9e7274f3d7 | ||
|
|
ac4e335bdb | ||
|
|
1d255de48d | ||
|
|
9c1dcd37f9 | ||
|
|
f8dac183d0 | ||
|
|
45d1fa27de | ||
|
|
98b798b0e5 | ||
|
|
f18b64e7da | ||
|
|
460be9d581 | ||
|
|
97c8caea48 | ||
|
|
a34f93076a | ||
|
|
8f1c470061 | ||
|
|
dbf2a2f689 | ||
|
|
39a34a57ac | ||
|
|
f44b39705a | ||
|
|
f64ce217b7 | ||
|
|
cdf87f4b8f | ||
|
|
47fbc2a84d | ||
|
|
5cd3b7d7cc | ||
|
|
0121fa51c2 | ||
|
|
892941b609 | ||
|
|
e3022628b6 | ||
|
|
fdc33d0dba | ||
|
|
6d6411cc24 | ||
|
|
61a1ecffc5 | ||
|
|
d2dce37fa0 | ||
|
|
d65f0434da | ||
|
|
7499384110 | ||
|
|
7a0f68b7ec | ||
|
|
1a219fd2b6 | ||
|
|
6fb98dae12 | ||
|
|
8c2c0fa47a | ||
|
|
58c8365c33 | ||
|
|
2c11ec38fa | ||
|
|
fe5e23502d | ||
|
|
644cd1d2fb | ||
|
|
5ccfab6314 | ||
|
|
bf895ea5b1 | ||
|
|
e956e2562b | ||
|
|
defe874bf4 | ||
|
|
3f8935ee4d | ||
|
|
cd12500dbf | ||
|
|
5dc61d50ac | ||
|
|
11a82e110a | ||
|
|
16ebe8bf48 | ||
|
|
1bbe25647a | ||
|
|
e22b461563 | ||
|
|
c4a5715e18 | ||
|
|
5cb64faa72 | ||
|
|
0f78aa45fc | ||
|
|
179112a310 | ||
|
|
4ce9ed4eb3 | ||
|
|
11414cb68f | ||
|
|
bd53e729a0 | ||
|
|
ffd9289edc | ||
|
|
b1a7b9607f | ||
|
|
119e53d07c | ||
|
|
c3e2e12369 | ||
|
|
a70b361176 | ||
|
|
8b68f1a8af | ||
|
|
c72bf3174f | ||
|
|
472cef2382 | ||
|
|
b1d8475a03 | ||
|
|
3d3faf80bf | ||
|
|
bbb372284b | ||
|
|
8cf81e9bfc | ||
|
|
d90f06b389 | ||
|
|
c519832755 | ||
|
|
397a0b988e | ||
|
|
f1428fa0e0 | ||
|
|
7d2c546ee5 | ||
|
|
2332c30e26 | ||
|
|
b78bf3e642 | ||
|
|
1a09f9b9a3 | ||
|
|
5ae5ec7bcd | ||
|
|
c0ff6631aa | ||
|
|
ae60a4d0fd | ||
|
|
800cfd5be0 | ||
|
|
f296f9d21d | ||
|
|
8265ffc19e | ||
|
|
f8efe98608 | ||
|
|
2b5444f89e | ||
|
|
07d446fd19 | ||
|
|
a25426043b | ||
|
|
84692846b9 | ||
|
|
039b2c6719 | ||
|
|
70a3d9ba3a | ||
|
|
bd619bf4e9 | ||
|
|
072519f539 | ||
|
|
d9572ec450 | ||
|
|
ba250aabf2 | ||
|
|
0cc4f0c016 | ||
|
|
1a2e367a87 | ||
|
|
4f24843f89 | ||
|
|
bfb92a47b9 | ||
|
|
dc5d55004b | ||
|
|
d8e7f96d4d | ||
|
|
bb83d1d72f | ||
|
|
1480260e47 | ||
|
|
c8d688d39f | ||
|
|
9df4352089 | ||
|
|
dd25fd0526 | ||
|
|
c90fd54b6b | ||
|
|
9528df48cd | ||
|
|
924c35f883 | ||
|
|
588ec415ff | ||
|
|
bf229414ba | ||
|
|
afa819547d | ||
|
|
dbcdc159ef | ||
|
|
30f945897a | ||
|
|
eee5794ff9 | ||
|
|
966a6ebd8e | ||
|
|
4d3d0fe0d7 | ||
|
|
7b967ff82a | ||
|
|
90f9598ecc | ||
|
|
7b3c7deb28 | ||
|
|
040a11656c | ||
|
|
1459245258 | ||
|
|
dbe4c5ce55 | ||
|
|
80491ecc2c | ||
|
|
1a71b58101 | ||
|
|
0ce37a69d4 | ||
|
|
722bfd5f7c | ||
|
|
b6cc3180d9 | ||
|
|
613395d1c2 | ||
|
|
82a87b7b5a | ||
|
|
9568028bf9 | ||
|
|
6df351772e | ||
|
|
541173b0c8 | ||
|
|
b6772d3778 | ||
|
|
20ea117a2c | ||
|
|
ff54c350bc |
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
__pycache__/
|
||||
/dist/
|
||||
/snscrape.egg-info/
|
||||
/.eggs/
|
||||
39
README.md
39
README.md
@@ -1,16 +1,19 @@
|
||||
# snscrape
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
|
||||
The following services are currently supported:
|
||||
* Facebook: user profiles and groups
|
||||
* Gab: user profile posts, media, and comments
|
||||
* Google+: user profiles
|
||||
|
||||
* Facebook: user profiles, groups, and communities (aka visitor posts)
|
||||
* Instagram: user profiles, hashtags, and locations
|
||||
* Twitter: user profiles, hashtags, searches, threads, and lists (members as well as posts)
|
||||
* Mastodon: user profiles and toots (single or thread)
|
||||
* Reddit: users, subreddits, and searches (via Pushshift)
|
||||
* Telegram: channels
|
||||
* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
|
||||
* VKontakte: user profiles
|
||||
* Weibo (Sina Weibo): user profiles
|
||||
|
||||
## Requirements
|
||||
snscrape requires Python 3.6 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
|
||||
Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
|
||||
|
||||
@@ -22,11 +25,28 @@ If you want to use the development version:
|
||||
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
|
||||
|
||||
## Usage
|
||||
To get all tweets by Jason Scott (@textfiles):
|
||||
### CLI
|
||||
The generic syntax of snscrape's CLI is:
|
||||
|
||||
snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
|
||||
|
||||
`snscrape --help` and `snscrape SCRAPER-NAME --help` provide details on the options and arguments. `snscrape --help` also lists all available scrapers.
|
||||
|
||||
The default output of the CLI is the URL of each result.
|
||||
|
||||
Some noteworthy global options are:
|
||||
|
||||
* `--jsonl` to get output as JSONL. This includes all information extracted by snscrape (e.g. message content, datetime, images; details vary by scraper).
|
||||
* `--max-results NUMBER` to only return the first `NUMBER` results.
|
||||
* `--with-entity` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with `--max-results 0` to only fetch the entity info.)
|
||||
|
||||
#### Examples
|
||||
Collect all tweets by Jason Scott (@textfiles):
|
||||
|
||||
snscrape twitter-user textfiles
|
||||
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `@textfiles-tweets`:
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `twitter-@textfiles`:
|
||||
|
||||
```bash
|
||||
snscrape twitter-user textfiles >twitter-@textfiles
|
||||
```
|
||||
@@ -35,8 +55,7 @@ To get the latest 100 tweets with the hashtag #archiveteam:
|
||||
|
||||
snscrape --max-results 100 twitter-hashtag archiveteam
|
||||
|
||||
`snscrape --help` or `snscrape <module> --help` provides details on the available options. `snscrape --help` also lists all available modules.
|
||||
|
||||
### Library
|
||||
It is also possible to use snscrape as a library in Python, but this is currently undocumented.
|
||||
|
||||
## Issue reporting
|
||||
|
||||
25
setup.py
25
setup.py
@@ -1,23 +1,42 @@
|
||||
import os.path
|
||||
import setuptools
|
||||
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp:
|
||||
readme = fp.read()
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name = 'snscrape',
|
||||
description = 'A social networking service scraper',
|
||||
long_description = readme,
|
||||
long_description_content_type = 'text/markdown',
|
||||
author = 'JustAnotherArchivist',
|
||||
url = 'https://github.com/JustAnotherArchivist/snscrape',
|
||||
classifiers = [
|
||||
'Development Status :: 4 - Beta',
|
||||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
],
|
||||
packages = ['snscrape', 'snscrape.modules'],
|
||||
setup_requires = ['setuptools_scm'],
|
||||
use_scm_version = True,
|
||||
install_requires = ['requests[socks]', 'lxml', 'beautifulsoup4'],
|
||||
install_requires = [
|
||||
'requests[socks]',
|
||||
'lxml',
|
||||
'beautifulsoup4',
|
||||
'pytz; python_version < "3.9.0"',
|
||||
'filelock',
|
||||
],
|
||||
python_requires = '~=3.8',
|
||||
extras_require = {
|
||||
'test': ['coverage'],
|
||||
},
|
||||
entry_points = {
|
||||
'console_scripts': [
|
||||
'snscrape = snscrape.cli:main',
|
||||
'snscrape = snscrape._cli:main',
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
import argparse
|
||||
import collections
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import datetime
|
||||
import importlib.metadata
|
||||
import inspect
|
||||
import logging
|
||||
import requests.models
|
||||
import requests
|
||||
# Imported in parse_args() after setting up the logger:
|
||||
#import snscrape.base
|
||||
#import snscrape.modules
|
||||
#import snscrape.version
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
|
||||
@@ -41,30 +45,31 @@ class Logger(logging.Logger):
|
||||
super().log(level, *args, **kwargs)
|
||||
|
||||
|
||||
def _requests_preparedrequest_repr(name, request):
|
||||
def _requests_request_repr(name, request):
|
||||
ret = []
|
||||
ret.append(repr(request))
|
||||
ret.append(f'{name} = {request!r}')
|
||||
ret.append(f'\n {name}.method = {request.method}')
|
||||
ret.append(f'\n {name}.url = {request.url}')
|
||||
ret.append(f'\n {name}.headers = \\')
|
||||
for field in request.headers:
|
||||
ret.append(f'\n {field} = {_repr("_", request.headers[field])}')
|
||||
if request.body:
|
||||
ret.append(f'\n {name}.body = ')
|
||||
ret.append(_repr('_', request.body).replace('\n', '\n '))
|
||||
for attr in ('body', 'params', 'data'):
|
||||
if hasattr(request, attr) and getattr(request, attr):
|
||||
ret.append(f'\n {name}.{attr} = ')
|
||||
ret.append(_repr('_', getattr(request, attr)).replace('\n', '\n '))
|
||||
return ''.join(ret)
|
||||
|
||||
|
||||
def _requests_response_repr(name, response, withHistory = True):
|
||||
ret = []
|
||||
ret.append(repr(response))
|
||||
ret.append(f'{name} = {response!r}')
|
||||
ret.append(f'\n {name}.url = {response.url}')
|
||||
ret.append(f'\n {name}.request = ')
|
||||
ret.append(_repr('_', response.request).replace('\n', '\n '))
|
||||
if withHistory and response.history:
|
||||
ret.append(f'\n {name}.history = [')
|
||||
for previousResponse in response.history:
|
||||
ret.append(f'\n ')
|
||||
ret.append('\n ')
|
||||
ret.append(_requests_response_repr('_', previousResponse, withHistory = False).replace('\n', '\n '))
|
||||
ret.append('\n ]')
|
||||
ret.append(f'\n {name}.status_code = {response.status_code}')
|
||||
@@ -75,12 +80,31 @@ def _requests_response_repr(name, response, withHistory = True):
|
||||
return ''.join(ret)
|
||||
|
||||
|
||||
def _requests_exception_repr(name, exc):
|
||||
ret = []
|
||||
ret.append(f'{name} = {exc!r}')
|
||||
ret.append('\n ' + _repr(f'{name}.request', exc.request).replace('\n', '\n '))
|
||||
ret.append('\n ' + _repr(f'{name}.response', exc.response).replace('\n', '\n '))
|
||||
return ''.join(ret)
|
||||
|
||||
|
||||
def _repr(name, value):
|
||||
if type(value) is requests.models.Response:
|
||||
if type(value) is requests.Response:
|
||||
return _requests_response_repr(name, value)
|
||||
if type(value) is requests.models.PreparedRequest:
|
||||
return _requests_preparedrequest_repr(name, value)
|
||||
valueRepr = repr(value)
|
||||
if type(value) in (requests.PreparedRequest, requests.Request):
|
||||
return _requests_request_repr(name, value)
|
||||
if isinstance(value, requests.exceptions.RequestException):
|
||||
return _requests_exception_repr(name, value)
|
||||
if isinstance(value, dict):
|
||||
return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
|
||||
'\n '.join(_repr(f'{name}[{k!r}]', v).replace('\n', '\n ') for k, v in value.items())
|
||||
if isinstance(value, (list, tuple, collections.deque)) and not all(isinstance(v, (int, str)) for v in value):
|
||||
return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
|
||||
'\n '.join(_repr(f'{name}[{i}]', v).replace('\n', '\n ') for i, v in enumerate(value))
|
||||
if dataclasses.is_dataclass(value) and not isinstance(value, type):
|
||||
return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
|
||||
'\n '.join(_repr(f'{name}.{f.name}', f.name) + ' = ' + _repr(f'{name}.{f.name}', getattr(value, f.name)).replace('\n', '\n ') for f in dataclasses.fields(value))
|
||||
valueRepr = f'{name} = {value!r}'
|
||||
if '\n' in valueRepr:
|
||||
return ''.join(['\\\n ', valueRepr.replace('\n', '\n ')])
|
||||
return valueRepr
|
||||
@@ -93,22 +117,38 @@ def _dump_locals_on_exception():
|
||||
except Exception as e:
|
||||
trace = inspect.trace()
|
||||
if len(trace) >= 2:
|
||||
name = _dump_stack_and_locals(trace[1:])
|
||||
name = _dump_stack_and_locals(trace[1:], exc = e)
|
||||
logger.fatal(f'Dumped stack and locals to {name}')
|
||||
raise
|
||||
|
||||
|
||||
def _dump_stack_and_locals(trace):
|
||||
def _dump_stack_and_locals(trace, exc = None):
|
||||
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
|
||||
if exc is not None:
|
||||
fp.write('Exception:\n')
|
||||
fp.write(f' {type(exc).__module__}.{type(exc).__name__}: {exc!s}\n')
|
||||
fp.write(f' args: {exc.args!r}\n')
|
||||
fp.write('\n')
|
||||
|
||||
fp.write('Stack:\n')
|
||||
for frameRecord in trace:
|
||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
if frameRecord.code_context is not None:
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
fp.write('\n')
|
||||
|
||||
for frameRecord in trace:
|
||||
module = inspect.getmodule(frameRecord[0])
|
||||
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
|
||||
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
|
||||
if module is None:
|
||||
# Module-less frame, e.g. dataclass.__init__
|
||||
for j in reversed(range(i)):
|
||||
if modules[j] is not None:
|
||||
break
|
||||
else:
|
||||
# No previous module scope
|
||||
continue
|
||||
module = modules[j]
|
||||
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
||||
continue
|
||||
locals_ = frameRecord[0].f_locals
|
||||
@@ -121,7 +161,7 @@ def _dump_stack_and_locals(trace):
|
||||
fp.write('\n')
|
||||
fp.write('\n')
|
||||
if 'self' in locals_ and hasattr(locals_['self'], '__dict__'):
|
||||
fp.write(f'Object dict:\n')
|
||||
fp.write('Object dict:\n')
|
||||
fp.write(repr(locals_['self'].__dict__))
|
||||
fp.write('\n\n')
|
||||
name = fp.name
|
||||
@@ -148,6 +188,45 @@ def parse_datetime_arg(arg):
|
||||
raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object')
|
||||
|
||||
|
||||
def parse_format(arg):
|
||||
# Replace '{' by '{0.' to use properties of the item, but keep '{{' intact
|
||||
parts = arg.split('{')
|
||||
out = ''
|
||||
it = iter(zip(parts, parts[1:]))
|
||||
for part, nextPart in it:
|
||||
out += part
|
||||
if nextPart == '': # Double brace
|
||||
out += '{{'
|
||||
next(it)
|
||||
else: # Single brace
|
||||
out += '{0.'
|
||||
out += parts[-1]
|
||||
return out
|
||||
|
||||
|
||||
class CitationAction(argparse.Action):
|
||||
def __init__(self, option_strings, dest = argparse.SUPPRESS, *args, default = argparse.SUPPRESS, **kwargs):
|
||||
super().__init__(option_strings, dest, *args, **kwargs)
|
||||
|
||||
def __call__(self, parser, namespace, values, optionString):
|
||||
try:
|
||||
m = importlib.metadata.metadata('snscrape')
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
print('Error: could not find snscrape installation. --citation does not work without the package being installed.', file = sys.stderr)
|
||||
parser.exit(1)
|
||||
print(f'Author: {m["author"]}')
|
||||
print(f'Title: {m["name"]}: {m["summary"]}')
|
||||
print(f'URL: {m["home-page"]}')
|
||||
print(f'Version: {m["version"]}')
|
||||
print(f'Date: 2018‒{m["version"].split(".", 3)[3][:4]}')
|
||||
|
||||
if '.dev' in m['version']:
|
||||
print()
|
||||
print('WARNING! You are running a development version. The date range may be incorrect. Please adjust the upper end of the range to the year of the commit.')
|
||||
|
||||
parser.exit()
|
||||
|
||||
|
||||
def parse_args():
|
||||
import snscrape.base
|
||||
import snscrape.modules
|
||||
@@ -155,28 +234,35 @@ def parse_args():
|
||||
|
||||
parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
|
||||
parser.add_argument('--citation', action = CitationAction, nargs = 0, help = 'Display recommended citation information and exit')
|
||||
parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
|
||||
parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
|
||||
parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
|
||||
help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
|
||||
parser.add_argument('-n', '--max-results', dest = 'maxResults', type = int, metavar = 'N', help = 'Only return the first N results')
|
||||
parser.add_argument('-f', '--format', dest = 'format', type = str, default = None, help = 'Output format')
|
||||
parser.add_argument('-n', '--max-results', dest = 'maxResults', type = lambda x: int(x) if int(x) >= 0 else parser.error('--max-results N must be zero or positive'), metavar = 'N', help = 'Only return the first N results')
|
||||
group = parser.add_mutually_exclusive_group(required = False)
|
||||
group.add_argument('-f', '--format', dest = 'format', type = parse_format, default = None, help = 'Output format')
|
||||
group.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
|
||||
parser.add_argument('--with-entity', dest = 'withEntity', action = 'store_true', default = False, help = 'Include the entity (e.g. user, channel) as the first output item')
|
||||
parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
|
||||
parser.add_argument('--progress', action = 'store_true', default = False, help = 'Report progress on stderr')
|
||||
|
||||
subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
|
||||
subparsers = parser.add_subparsers(dest = 'scraper', metavar = 'SCRAPER', title = 'scrapers', required = True)
|
||||
classes = snscrape.base.Scraper.__subclasses__()
|
||||
scrapers = {}
|
||||
for cls in classes:
|
||||
if cls.name is not None:
|
||||
subparser = subparsers.add_parser(cls.name, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
|
||||
cls.setup_parser(subparser)
|
||||
subparser.set_defaults(cls = cls)
|
||||
scrapers[cls.name] = cls
|
||||
classes.extend(cls.__subclasses__())
|
||||
for scraper, cls in sorted(scrapers.items()):
|
||||
subparser = subparsers.add_parser(cls.name, help = '', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
|
||||
cls._cli_setup_parser(subparser)
|
||||
subparser.set_defaults(cls = cls)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# http://bugs.python.org/issue16308 / https://bugs.python.org/issue26510 (fixed in Python 3.7)
|
||||
if not args.scraper:
|
||||
raise RuntimeError('Error: no scraper specified')
|
||||
if not args.withEntity and args.maxResults == 0:
|
||||
parser.error('--max-results 0 is only valid when used with --with-entity')
|
||||
|
||||
return args
|
||||
|
||||
@@ -217,20 +303,36 @@ def main():
|
||||
setup_logging()
|
||||
args = parse_args()
|
||||
configure_logging(args.verbosity, args.dumpLocals)
|
||||
scraper = args.cls.from_args(args)
|
||||
scraper = args.cls._cli_from_args(args)
|
||||
|
||||
i = 0
|
||||
with _dump_locals_on_exception():
|
||||
if args.withEntity and (entity := scraper.entity):
|
||||
if args.jsonl:
|
||||
print(entity.json())
|
||||
else:
|
||||
print(entity)
|
||||
if args.maxResults == 0:
|
||||
logger.info('Exiting after 0 results')
|
||||
return
|
||||
for i, item in enumerate(scraper.get_items(), start = 1):
|
||||
if args.since is not None and item.date < args.since:
|
||||
logger.info(f'Exiting due to reaching older results than {args.since}')
|
||||
break
|
||||
if args.format is not None:
|
||||
print(args.format.format(**item._asdict()))
|
||||
if args.jsonl:
|
||||
print(item.json())
|
||||
elif args.format is not None:
|
||||
print(args.format.format(item))
|
||||
else:
|
||||
print(item)
|
||||
if args.progress and i % 100 == 0:
|
||||
print(f'Scraping, {i} results so far', file = sys.stderr)
|
||||
if args.maxResults and i >= args.maxResults:
|
||||
logger.info(f'Exiting after {i} results')
|
||||
if args.progress:
|
||||
print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
|
||||
break
|
||||
else:
|
||||
logger.info(f'Done, found {i} results')
|
||||
if args.progress:
|
||||
print(f'Finished, {i} results', file = sys.stderr)
|
||||
158
snscrape/base.py
158
snscrape/base.py
@@ -1,22 +1,121 @@
|
||||
import abc
|
||||
import copy
|
||||
import dataclasses
|
||||
import datetime
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import time
|
||||
import warnings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Item:
|
||||
class _DeprecatedProperty:
|
||||
def __init__(self, name, repl, replStr):
|
||||
self.name = name
|
||||
self.repl = repl
|
||||
self.replStr = replStr
|
||||
|
||||
def __get__(self, obj, objType):
|
||||
if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
|
||||
return self
|
||||
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2)
|
||||
return self.repl(obj)
|
||||
|
||||
|
||||
def _json_serialise_datetime(obj):
|
||||
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
|
||||
|
||||
if isinstance(obj, (datetime.datetime, datetime.date)):
|
||||
return obj.isoformat()
|
||||
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
|
||||
|
||||
|
||||
def _json_dataclass_to_dict(obj):
|
||||
if isinstance(obj, _JSONDataclass) or dataclasses.is_dataclass(obj):
|
||||
out = {}
|
||||
out['_type'] = f'{type(obj).__module__}.{type(obj).__name__}'
|
||||
for field in dataclasses.fields(obj):
|
||||
assert field.name != '_type'
|
||||
if field.name.startswith('_'):
|
||||
continue
|
||||
out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
|
||||
# Add in (non-deprecated) properties
|
||||
for k in dir(obj):
|
||||
if isinstance(getattr(type(obj), k, None), property):
|
||||
assert k != '_type'
|
||||
if k.startswith('_'):
|
||||
continue
|
||||
out[k] = _json_dataclass_to_dict(getattr(obj, k))
|
||||
return out
|
||||
elif isinstance(obj, (tuple, list)):
|
||||
return type(obj)(_json_dataclass_to_dict(x) for x in obj)
|
||||
elif isinstance(obj, dict):
|
||||
return {_json_dataclass_to_dict(k): _json_dataclass_to_dict(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, set):
|
||||
return {_json_dataclass_to_dict(v) for v in obj}
|
||||
else:
|
||||
return copy.deepcopy(obj)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class _JSONDataclass:
|
||||
'''A base class for dataclasses for conversion to JSON'''
|
||||
|
||||
def json(self):
|
||||
'''Convert the object to a JSON string'''
|
||||
|
||||
out = _json_dataclass_to_dict(self)
|
||||
for key, value in list(out.items()): # Modifying the dict below, so make a copy first
|
||||
if isinstance(value, IntWithGranularity):
|
||||
out[key] = int(value)
|
||||
assert f'{key}.granularity' not in out, f'Granularity collision on {key}.granularity'
|
||||
out[f'{key}.granularity'] = value.granularity
|
||||
return json.dumps(out, default = _json_serialise_datetime)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Item(_JSONDataclass):
|
||||
'''An abstract base class for an item returned by the scraper's get_items generator.
|
||||
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).
|
||||
'''
|
||||
|
||||
@abc.abstractmethod
|
||||
def __str__(self):
|
||||
pass
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class Entity(_JSONDataclass):
|
||||
'''An abstract base class for an entity returned by the scraper's entity property.
|
||||
|
||||
An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.
|
||||
'''
|
||||
|
||||
@abc.abstractmethod
|
||||
def __str__(self):
|
||||
pass
|
||||
|
||||
|
||||
class IntWithGranularity(int):
|
||||
'''A number with an associated granularity
|
||||
|
||||
For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.
|
||||
'''
|
||||
|
||||
def __new__(cls, value, granularity, *args, **kwargs):
|
||||
obj = super().__new__(cls, value, *args, **kwargs)
|
||||
obj.granularity = granularity
|
||||
return obj
|
||||
|
||||
def __reduce__(self):
|
||||
return (IntWithGranularity, (int(self), self.granularity))
|
||||
|
||||
|
||||
class URLItem(Item):
|
||||
'''A generic item which only holds a URL string.'''
|
||||
|
||||
@@ -40,34 +139,57 @@ class Scraper:
|
||||
|
||||
name = None
|
||||
|
||||
def __init__(self, retries = 3):
|
||||
def __init__(self, *, retries = 3, proxies = None):
|
||||
self._retries = retries
|
||||
self._proxies = proxies
|
||||
self._session = requests.Session()
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_items(self):
|
||||
'''Iterator yielding Items.'''
|
||||
|
||||
pass
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
def _get_entity(self):
|
||||
'''Get the entity behind the scraper, if any.
|
||||
|
||||
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
|
||||
'''
|
||||
|
||||
return None
|
||||
|
||||
@functools.cached_property
|
||||
def entity(self):
|
||||
return self._get_entity()
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
||||
proxies = proxies or self._proxies or {}
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
||||
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
|
||||
logger.info(f'Retrieving {req.url}')
|
||||
logger.debug(f'... with headers: {headers!r}')
|
||||
if data:
|
||||
logger.debug(f'... with data: {data!r}')
|
||||
if environmentSettings:
|
||||
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
|
||||
try:
|
||||
r = self._session.send(req, timeout = timeout)
|
||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
|
||||
except requests.exceptions.RequestException as exc:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.WARNING
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
||||
else:
|
||||
redirected = f' (redirected to {r.url})' if r.history else ''
|
||||
logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
|
||||
if r.history:
|
||||
for i, redirect in enumerate(r.history):
|
||||
logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})')
|
||||
if responseOkCallback is not None:
|
||||
success, msg = responseOkCallback(r)
|
||||
else:
|
||||
@@ -80,7 +202,7 @@ class Scraper:
|
||||
else:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.WARNING
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
@@ -102,11 +224,23 @@ class Scraper:
|
||||
return self._request('POST', *args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def setup_parser(cls, subparser):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def from_args(cls, args):
|
||||
pass
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._construct(args)
|
||||
|
||||
@classmethod
|
||||
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
||||
return cls(*args, **kwargs, retries = argparseArgs.retries)
|
||||
|
||||
|
||||
def nonempty_string(name):
|
||||
def f(s):
|
||||
s = s.strip()
|
||||
if s:
|
||||
return s
|
||||
raise ValueError('must not be an empty string')
|
||||
f.__name__ = name
|
||||
return f
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
import importlib
|
||||
import os
|
||||
import snscrape.base
|
||||
import pkgutil
|
||||
|
||||
|
||||
__all__ = []
|
||||
|
||||
|
||||
def _import_modules():
|
||||
files = os.listdir(__path__[0])
|
||||
for fn in files:
|
||||
if fn.endswith('.py') and fn != '__init__.py':
|
||||
# Import module if not already imported
|
||||
moduleName = f'snscrape.modules.{fn[:-3]}'
|
||||
module = importlib.import_module(moduleName)
|
||||
prefixLen = len(__name__) + 1
|
||||
for importer, moduleName, isPkg in pkgutil.iter_modules(__path__, prefix = f'{__name__}.'):
|
||||
assert not isPkg
|
||||
moduleNameWithoutPrefix = moduleName[prefixLen:]
|
||||
__all__.append(moduleNameWithoutPrefix)
|
||||
module = importer.find_module(moduleName).load_module(moduleName)
|
||||
globals()[moduleNameWithoutPrefix] = module
|
||||
|
||||
|
||||
_import_modules()
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
__all__ = ['FacebookPost', 'User', 'FacebookUserScraper', 'FacebookCommunityScraper', 'FacebookGroupScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
@@ -8,22 +12,44 @@ import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class FacebookPost(snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
date: datetime.datetime
|
||||
content: typing.Optional[str]
|
||||
outlinks: list
|
||||
outlinksss: str
|
||||
|
||||
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
||||
|
||||
def __str__(self):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
pageId: int
|
||||
name: str
|
||||
verified: bool
|
||||
created: typing.Optional[datetime.date] = None
|
||||
pageOwner: typing.Optional[str] = None
|
||||
likes: typing.Optional[int] = None
|
||||
followers: typing.Optional[int] = None
|
||||
checkins: typing.Optional[int] = None
|
||||
address: typing.Optional[str] = None
|
||||
phone: typing.Optional[str] = None
|
||||
web: typing.Optional[str] = None
|
||||
keywords: typing.Optional[typing.List[str]] = None
|
||||
|
||||
def __str__(self):
|
||||
return f'https://www.facebook.com/{self.username}/'
|
||||
|
||||
|
||||
class _FacebookCommonScraper(snscrape.base.Scraper):
|
||||
def _clean_url(self, dirtyUrl):
|
||||
u = urllib.parse.urlparse(dirtyUrl)
|
||||
if u.path == '/permalink.php':
|
||||
@@ -41,7 +67,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
if setVal.rstrip('0123456789').endswith('.a.'):
|
||||
setVal = f'a.{setVal.rsplit(".", 1)[1]}'
|
||||
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
|
||||
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/'):
|
||||
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
|
||||
# No manipulation of the path needed, but strip the query string
|
||||
clean = (u.scheme, u.netloc, u.path, '', '')
|
||||
elif u.path.split('/')[2] in ('photos', 'videos'):
|
||||
@@ -80,11 +106,23 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
return False, None
|
||||
|
||||
def _soup_to_items(self, soup, baseUrl, mode):
|
||||
cleanUrl = None # Value from previous iteration is used for warning on link-less entries
|
||||
for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
|
||||
# Check that this is not inside another div._5pcr to avoid duplicates or extracting the wrong URL (e.g. 'X was mentioned in a post' on community pages)
|
||||
parent = entry.parent
|
||||
isNested = False
|
||||
while parent:
|
||||
if parent.name == 'div' and 'class' in parent.attrs and '_5pcr' in parent.attrs['class']:
|
||||
isNested = True
|
||||
break
|
||||
parent = parent.parent
|
||||
if isNested:
|
||||
continue
|
||||
|
||||
entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
|
||||
mediaSetA = entry.find('a', class_ = '_17z-')
|
||||
if not mediaSetA and not entryA:
|
||||
logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
|
||||
_logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
|
||||
continue
|
||||
if mediaSetA and (not entryA or entryA['href'] == '#'):
|
||||
href = mediaSetA['href']
|
||||
@@ -93,12 +131,12 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
oddLink, warn = self._is_odd_link(href, entry.text, mode)
|
||||
if oddLink:
|
||||
if warn:
|
||||
logger.warning(f'Ignoring odd link: {href}')
|
||||
_logger.warning(f'Ignoring odd link: {href}')
|
||||
continue
|
||||
dirtyUrl = urllib.parse.urljoin(baseUrl, href)
|
||||
cleanUrl = self._clean_url(dirtyUrl)
|
||||
date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
|
||||
contentDiv = entry.find('div', class_ = '_5pbx')
|
||||
if contentDiv:
|
||||
if (contentDiv := entry.find('div', class_ = '_5pbx')):
|
||||
content = contentDiv.text
|
||||
else:
|
||||
content = None
|
||||
@@ -111,71 +149,157 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
continue
|
||||
query = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
|
||||
if 'u' not in query or len(query['u']) != 1:
|
||||
logger.warning(f'Ignoring odd outlink: {href}')
|
||||
_logger.warning(f'Ignoring odd outlink: {href}')
|
||||
continue
|
||||
outlink = query['u'][0]
|
||||
if outlink.startswith('http://') or outlink.startswith('https://') and outlink not in outlinks:
|
||||
outlinks.append(outlink)
|
||||
yield FacebookPost(cleanUrl = self._clean_url(dirtyUrl), dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks, outlinksss = ' '.join(outlinks))
|
||||
yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks)
|
||||
|
||||
|
||||
class FacebookUserScraper(FacebookCommonScraper):
|
||||
name = 'facebook-user'
|
||||
|
||||
class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._username = username
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
self._initialPage = None
|
||||
self._initialPageSoup = None
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
self._initialPage = r
|
||||
self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
|
||||
nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
_logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error('Got status code {r.status_code}')
|
||||
return
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
while nextPageLink:
|
||||
logger.info('Retrieving next page')
|
||||
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
|
||||
_logger.info('Retrieving next page')
|
||||
|
||||
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
|
||||
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
|
||||
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
|
||||
r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
response = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||
assert 'domops' in response
|
||||
assert len(response['domops']) == 1
|
||||
assert len(response['domops'][0]) == 4
|
||||
assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
|
||||
assert response['domops'][0][1] == '#www_pages_reaction_see_more_unitwww_pages_home'
|
||||
assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
|
||||
assert response['domops'][0][2] == False
|
||||
assert '__html' in response['domops'][0][3]
|
||||
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Facebook username or user ID')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.username, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class FacebookGroupScraper(FacebookCommonScraper):
|
||||
class FacebookUserScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-user'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
|
||||
def _get_entity(self):
|
||||
kwargs = {}
|
||||
|
||||
nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
|
||||
handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
|
||||
handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
|
||||
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
|
||||
createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')
|
||||
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
handleDiv = handleDivPattern.search(r.text)
|
||||
handle = handlePattern.search(handleDiv.group(0))
|
||||
kwargs['username'] = handle.group(1)
|
||||
|
||||
nameVerifiedMarkup = nameVerifiedMarkupPattern.search(r.text)
|
||||
nameVerifiedMarkup = json.loads(nameVerifiedMarkup.group(1))
|
||||
nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedMarkup, 'lxml')
|
||||
kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
|
||||
kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))
|
||||
|
||||
pageTransparencyContentDiv = soup.find('div', class_ = '_61-0')
|
||||
if pageTransparencyContentDiv.text.startswith('Page created - '):
|
||||
createdDateMess = pageTransparencyContentDiv.text.split(' - ', 1)[1]
|
||||
m = createdDatePattern.match(createdDateMess)
|
||||
assert m, 'unexpected created div content'
|
||||
kwargs['created'] = datetime.date(int(m.group(3)), months.index(m.group(1)) + 1, int(m.group(2)))
|
||||
if pageTransparencyContentDiv.text.startswith('Confirmed Page Owner: '):
|
||||
kwargs['pageOwner'] = pageTransparencyContentDiv.text.split(': ', 1)[1]
|
||||
|
||||
communityDiv = soup.find('div', class_ = '_6590')
|
||||
for div in communityDiv.find_all('div', class_ = '_4bl9'):
|
||||
text = div.text
|
||||
if text.endswith(' people like this'):
|
||||
kwargs['likes'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' people follow this'):
|
||||
kwargs['followers'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' check-ins'):
|
||||
kwargs['checkins'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
|
||||
aboutDiv = soup.find('div', class_ = '_u9q')
|
||||
if aboutDiv:
|
||||
# As if the above wasn't already ugly enough, this is where it gets really bad...
|
||||
for div in aboutDiv.find_all('div', class_ = '_2pi9'):
|
||||
img = div.find('img', class_ = '_3-91')
|
||||
if not img:
|
||||
continue
|
||||
if img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png': # Address
|
||||
rawAddress = div.find('div', class_ = '_2wzd').text
|
||||
kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png': # Phone number
|
||||
kwargs['phone'] = div.find('div', class_ = '_4bl9').text
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png': # Web link
|
||||
for a in div.find_all('a'):
|
||||
if a.text == '' or 'href' not in a.attrs or a.find('span'):
|
||||
continue
|
||||
dirtyWeb = a['href']
|
||||
assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
|
||||
kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png': # Keywords
|
||||
kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')
|
||||
|
||||
androidUrlMeta = soup.find('meta', property = 'al:android:url')
|
||||
assert androidUrlMeta['content'].startswith('fb://page/') and androidUrlMeta['content'].endswith('?referrer=app_link')
|
||||
kwargs['pageId'] = int(androidUrlMeta['content'][10:-18])
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
|
||||
class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-community'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/community/'
|
||||
|
||||
|
||||
class FacebookGroupScraper(_FacebookCommonScraper):
|
||||
name = 'facebook-group'
|
||||
|
||||
def __init__(self, group, **kwargs):
|
||||
@@ -189,18 +313,16 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
baseUrl = f'https://www.facebook.com/groups/{self._group}/'
|
||||
baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('Group does not exist')
|
||||
_logger.warning('Group does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error('Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
|
||||
logger.error('Code container ID marker not found (does the group exist?)')
|
||||
return
|
||||
raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')
|
||||
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
@@ -210,35 +332,33 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
|
||||
codeContainer = soup.find('code', id = codeContainerId)
|
||||
if not codeContainer:
|
||||
raise RuntimeError('Code container not found')
|
||||
raise snscrape.base.ScraperException('Code container not found')
|
||||
if type(codeContainer.string) is not bs4.element.Comment:
|
||||
raise RuntimeError('Code container does not contain a comment')
|
||||
raise snscrape.base.ScraperException('Code container does not contain a comment')
|
||||
codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
|
||||
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
|
||||
|
||||
# Pagination
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
while True:
|
||||
while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
|
||||
# As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
|
||||
r = self._get(
|
||||
f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
|
||||
'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
|
||||
params = {'data': data, '__a': 1},
|
||||
headers = headers,
|
||||
)
|
||||
if r.status_code != 200:
|
||||
raise RuntimeError(f'Got status code {r.status_code}')
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
obj = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||
if obj['payload'] == '':
|
||||
# End of pagination
|
||||
break
|
||||
soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'group')
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('group', help = 'A group name or ID')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('group', type = snscrape.base.nonempty_string('group'), help = 'A group name or ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.group, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.group)
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import snscrape.base
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GabPost(typing.NamedTuple, snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
class GabUserCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, mode, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if mode not in ('posts', 'comments', 'media'):
|
||||
raise ValueError('Invalid mode')
|
||||
self._mode = mode
|
||||
self._username = username
|
||||
if mode == 'posts':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}'
|
||||
self._beforeGlue = '?'
|
||||
elif mode == 'comments':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}/comments?includes=post.conversation_parent'
|
||||
self._beforeGlue = '&'
|
||||
elif mode == 'media':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}/media'
|
||||
self._beforeGlue = '?'
|
||||
|
||||
def _response_to_items(self, response):
|
||||
yielded = set()
|
||||
for post in response['data']:
|
||||
if post['post']['id'] not in yielded:
|
||||
yield GabPost(
|
||||
url = f'https://gab.com/{post["post"]["user"]["username"]}/posts/{post["post"]["id"]}',
|
||||
date = datetime.datetime.strptime(post['post']['created_at'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z'),
|
||||
content = post['post']['body'],
|
||||
)
|
||||
yielded.add(post['post']['id'])
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.error('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
response = json.loads(r.text)
|
||||
if not response['data']:
|
||||
logger.error('User has no posts')
|
||||
return
|
||||
yield from self._response_to_items(response)
|
||||
if self._mode == 'posts':
|
||||
before = response['data'][-1]['published_at']
|
||||
elif self._mode in ('comments', 'media'):
|
||||
before = 30
|
||||
|
||||
while True:
|
||||
logger.info('Retrieving next page')
|
||||
r = self._get(f'{self._baseUrl}{self._beforeGlue}before={before}', headers = headers)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
response = json.loads(r.text)
|
||||
yield from self._response_to_items(response)
|
||||
if response['no-more'] or not response['data']:
|
||||
# Last page
|
||||
return
|
||||
if self._mode == 'posts':
|
||||
before = response['data'][-1]['published_at']
|
||||
elif self._mode in ('comments', 'media'):
|
||||
before += 30
|
||||
time.sleep(1) # Gab's API is pretty quick but doesn't like being hammered...
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Gab username')
|
||||
|
||||
|
||||
class GabUserPostsScraper(GabUserCommonScraper):
|
||||
name = 'gab-user'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('posts', args.username, retries = args.retries)
|
||||
|
||||
|
||||
class GabUserCommentsScraper(GabUserCommonScraper):
|
||||
name = 'gab-user-comments'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('comments', args.username, retries = args.retries)
|
||||
|
||||
|
||||
class GabUserMediaScraper(GabUserCommonScraper):
|
||||
name = 'gab-user-media'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('media', args.username, retries = args.retries)
|
||||
@@ -1,102 +0,0 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GooglePlusUserScraper(snscrape.base.Scraper):
|
||||
name = 'googleplus-user'
|
||||
|
||||
def __init__(self, user, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._user = user
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# Global data; only needed for the session ID
|
||||
#TODO: Make this more robust somehow
|
||||
match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
|
||||
if not match:
|
||||
logger.error('Unable to find session ID')
|
||||
return
|
||||
sid = match.group('sid')
|
||||
|
||||
# Page data
|
||||
# As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
|
||||
match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
|
||||
if not match:
|
||||
logger.error('Unable to extract data')
|
||||
return
|
||||
jsonData = match.group('data')
|
||||
response = json.loads(jsonData)
|
||||
if response[0][7] is None:
|
||||
logger.info('User has no posts')
|
||||
return
|
||||
for postObj in response[0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
cursor = response[0][1] # 'ADSJ_x'
|
||||
if cursor is None:
|
||||
# No further pages
|
||||
return
|
||||
baseDate = datetime.datetime.utcnow()
|
||||
baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
|
||||
userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]
|
||||
|
||||
for counter in itertools.count(start = 2):
|
||||
logger.info('Retrieving next page')
|
||||
reqid = 1 + baseSeconds + int(1e5) * counter
|
||||
r = self._post(
|
||||
f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
|
||||
data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
|
||||
headers = headers
|
||||
)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# As if everything up to here wasn't terrible already, this is where it gets *really* bad.
|
||||
# The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
|
||||
# The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
|
||||
# It sucks.
|
||||
# Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
|
||||
# I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
|
||||
# Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.
|
||||
|
||||
garbage = r.text
|
||||
assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
|
||||
data = []
|
||||
pos = 6
|
||||
while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
|
||||
pos += 1
|
||||
response = json.JSONDecoder().raw_decode(''.join(garbage[pos:]))[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end
|
||||
|
||||
for postObj in response[0][2]['74333095'][0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
|
||||
cursor = response[0][2]['74333095'][0][1]
|
||||
|
||||
if cursor is None:
|
||||
break
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.user, retries = args.retries)
|
||||
@@ -1,73 +1,87 @@
|
||||
__all__ = ['InstagramPost', 'User', 'InstagramUserScraper', 'InstagramHashtagScraper', 'InstagramLocationScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
@dataclasses.dataclass
|
||||
class InstagramPost(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
content: typing.Optional[str]
|
||||
thumbnailUrl: str
|
||||
displayUrl: str
|
||||
username: typing.Optional[str]
|
||||
likes: int
|
||||
comments: int
|
||||
commentsDisabled: bool
|
||||
isVideo: bool
|
||||
|
||||
def __str__(self):
|
||||
return self.cleanUrl
|
||||
return self.url
|
||||
|
||||
|
||||
class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, mode, name, **kwargs):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: typing.Optional[str]
|
||||
followers: snscrape.base.IntWithGranularity
|
||||
following: snscrape.base.IntWithGranularity
|
||||
posts: snscrape.base.IntWithGranularity
|
||||
|
||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||
|
||||
def __str__(self):
|
||||
return f'https://www.instagram.com/{self.username}/'
|
||||
|
||||
|
||||
class _InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if mode not in ('User', 'Hashtag', 'Location'):
|
||||
raise ValueError('Invalid mode')
|
||||
self._mode = mode
|
||||
self._name = name
|
||||
|
||||
if self._mode == 'User':
|
||||
self._initialUrl = f'https://www.instagram.com/{self._name}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
self._responseContainer = 'user'
|
||||
self._edgeXToMedia = 'edge_owner_to_timeline_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
elif self._mode == 'Hashtag':
|
||||
self._initialUrl = f'https://www.instagram.com/explore/tags/{self._name}/'
|
||||
self._pageName = 'TagPage'
|
||||
self._responseContainer = 'hashtag'
|
||||
self._edgeXToMedia = 'edge_hashtag_to_media'
|
||||
self._pageIDKey = 'name'
|
||||
self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
|
||||
self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
elif self._mode == 'Location':
|
||||
self._initialUrl = f'https://www.instagram.com/explore/locations/{self._name}/'
|
||||
self._pageName = 'LocationsPage'
|
||||
self._responseContainer = 'location'
|
||||
self._edgeXToMedia = 'edge_location_to_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
self._initialPage = None
|
||||
|
||||
def _response_to_items(self, response):
|
||||
for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
code = node['node']['shortcode']
|
||||
usernameQuery = '?taken-by=' + node['node']['owner']['username'] if 'username' in node['node']['owner'] else ''
|
||||
cleanUrl = f'https://www.instagram.com/p/{code}/'
|
||||
username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else None
|
||||
url = f'https://www.instagram.com/p/{code}/'
|
||||
yield InstagramPost(
|
||||
cleanUrl = cleanUrl,
|
||||
dirtyUrl = f'{cleanUrl}{usernameQuery}',
|
||||
url = url,
|
||||
date = datetime.datetime.fromtimestamp(node['node']['taken_at_timestamp'], datetime.timezone.utc),
|
||||
content = node['node']['edge_media_to_caption']['edges'][0]['node']['text'] if len(node['node']['edge_media_to_caption']['edges']) else None,
|
||||
thumbnailUrl = node['node']['thumbnail_src'],
|
||||
displayUrl = node['node']['display_url'],
|
||||
username = username,
|
||||
likes = node['node']['edge_media_preview_like']['count'],
|
||||
comments = node['node']['edge_media_to_comment']['count'],
|
||||
commentsDisabled = node['node']['comments_disabled'],
|
||||
isVideo = node['node']['is_video'],
|
||||
)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
self._initialPage = r
|
||||
return self._initialPage
|
||||
|
||||
def _check_initial_page_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return True, None
|
||||
@@ -82,6 +96,8 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def _check_json_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return False, f'status code {r.status_code}'
|
||||
if r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
try:
|
||||
obj = json.loads(r.text)
|
||||
except json.JSONDecodeError as e:
|
||||
@@ -90,23 +106,17 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return True, None
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = headers, responseOkCallback = self._check_initial_page_callback)
|
||||
r = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning(f'{self._mode} does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
_logger.warning('Page does not exist')
|
||||
return
|
||||
response = r._snscrape_json_obj
|
||||
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
||||
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
||||
logger.info(f'{self._mode} has no posts')
|
||||
_logger.info('Page has no posts')
|
||||
return
|
||||
if not response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
logger.warning('Private account')
|
||||
_logger.warning('Private account')
|
||||
return
|
||||
pageID = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._pageIDKey]
|
||||
yield from self._response_to_items(response['entry_data'][self._pageName][0]['graphql'])
|
||||
@@ -114,16 +124,16 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return
|
||||
endCursor = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
headers = self._headers.copy()
|
||||
while True:
|
||||
logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
_logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
variables = self._variablesFormat.format(**locals())
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
|
||||
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
response = r._snscrape_json_obj
|
||||
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
@@ -134,37 +144,100 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
endCursor = response['data'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
|
||||
class InstagramUserScraper(InstagramCommonScraper):
|
||||
class InstagramUserScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-user'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'An Instagram username (no leading @)')
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/{username}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
self._responseContainer = 'user'
|
||||
self._edgeXToMedia = 'edge_owner_to_timeline_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
def _get_entity(self):
|
||||
r = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
if '<meta property="og:description" content="' not in r.text:
|
||||
return
|
||||
ogDescriptionContentPos = r.text.index('<meta property="og:description" content="') + len('<meta property="og:description" content="')
|
||||
ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]
|
||||
|
||||
numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
|
||||
ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_-]+))$')
|
||||
m = ogDescriptionPattern.match(ogDescription)
|
||||
assert m, 'unexpected og:description format'
|
||||
|
||||
def parse_num(s):
|
||||
if s.endswith('m'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
elif s.endswith('k'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
|
||||
following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
|
||||
posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
|
||||
return User(
|
||||
username = m.group(5) or m.group(6),
|
||||
name = m.group(4) or None,
|
||||
followers = followers,
|
||||
following = following,
|
||||
posts = posts,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('User', args.username, retries = args.retries)
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'An Instagram username (no leading @)')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class InstagramHashtagScraper(InstagramCommonScraper):
|
||||
class InstagramHashtagScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-hashtag'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('hashtag', help = 'An Instagram hashtag (no leading #)')
|
||||
def __init__(self, hashtag, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
|
||||
self._pageName = 'TagPage'
|
||||
self._responseContainer = 'hashtag'
|
||||
self._edgeXToMedia = 'edge_hashtag_to_media'
|
||||
self._pageIDKey = 'name'
|
||||
self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
|
||||
self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('Hashtag', args.hashtag, retries = args.retries)
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('hashtag', type = snscrape.base.nonempty_string('hashtag'), help = 'An Instagram hashtag (no leading #)')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.hashtag)
|
||||
|
||||
|
||||
class InstagramLocationScraper(InstagramCommonScraper):
|
||||
class InstagramLocationScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-location'
|
||||
|
||||
def __init__(self, locationId, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
|
||||
self._pageName = 'LocationsPage'
|
||||
self._responseContainer = 'location'
|
||||
self._edgeXToMedia = 'edge_location_to_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('locationid', help = 'An Instagram location ID', type = int)
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('Location', args.locationid, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.locationid)
|
||||
|
||||
340
snscrape/modules/mastodon.py
Normal file
340
snscrape/modules/mastodon.py
Normal file
@@ -0,0 +1,340 @@
|
||||
__all__ = ['Toot', 'Boost', 'Attachment', 'Poll', 'PollOption', 'User', 'CustomEmoji', 'MastodonProfileScraper', 'MastodonTootScraperMode', 'MastodonTootScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import enum
|
||||
import json
|
||||
import logging
|
||||
import snscrape.base
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Toot(snscrape.base.Item):
    '''A single toot as extracted from an instance's HTML pages.

    str() of a toot is its URL.
    '''

    url: str  # href of the toot's timestamp link
    id: str  # Last path segment of url
    user: 'User'  # Author of the toot
    date: datetime.datetime  # Publication time (parsed as UTC)
    text: str
    # Only set for toots that have a spoiler link; see _MastodonCommonScraper._entries_to_items for which part goes where
    spoilerText: typing.Optional[str] = None
    attachments: typing.Optional[typing.List['Attachment']] = None
    links: typing.Optional[typing.List[str]] = None  # Absolute URLs of links that are neither mentions nor hashtags
    mentionedUsers: typing.Optional[typing.List['User']] = None
    hashtags: typing.Optional[typing.List[str]] = None
    poll: typing.Optional['Poll'] = None

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Boost(snscrape.base.Item):
    '''A boost: a toot that was re-shared by another user.'''

    user: 'User'  # The user who boosted
    toot: Toot  # The toot that was boosted

    def __str__(self):
        # Boosts don't have their own URLs
        return str(self.toot)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Attachment:
    '''A file attached to a toot.'''

    url: str  # Absolute URL of the attachment
    name: str  # Link text or file name, depending on the instance version
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Poll:
    '''A poll attached to a toot.'''

    id: str
    expirationDate: datetime.datetime
    multiple: bool
    options: typing.List['PollOption']
    votesCount: int
    votersCount: typing.Optional[int] = None  # Available since version 3.0.0 (commit 3babf846)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PollOption:
    '''One answer choice of a Poll.'''

    title: str
    votesCount: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''A Mastodon (or compatible) account.

    str() of a user is its profile URL.
    '''

    account: str  # @username@domain.invalid
    displayName: typing.Optional[str] = None
    displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
    avatarUrl: typing.Optional[str] = None
    _url: typing.Optional[str] = None  # Explicit profile URL; takes precedence over the one derived from account

    @property
    def url(self):
        '''The profile URL: the stored one if set, otherwise https://domain/@username built from account.'''
        if self._url:
            return self._url
        # '@username@domain' -> ['username', 'domain'] -> 'domain/@username'
        parts = self.account[1:].split('@')
        parts.reverse()
        return 'https://' + '/@'.join(parts)

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CustomEmoji:
    '''A custom emoji appearing in a display name.'''

    shortName: str  # The image's alt text
    url: str
    # Optional because the display name parser has a code path (emojis marked with the 'emojione' class)
    # that only knows a single image URL and constructs the emoji without a static URL.
    # With a required field, that construction raised a TypeError.
    staticUrl: typing.Optional[str] = None
|
||||
|
||||
|
||||
class _MastodonCommonScraper(snscrape.base.Scraper):
    '''Shared request throttling and HTML parsing for the Mastodon scrapers.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
        self._lastRequest = 0  # time.time() of the most recent request; 0 means no request was made yet

    def _rate_limited_get(self, *args, **kwargs):
        '''Forward to self._get, sleeping first if the previous request was less than 3 seconds ago.'''
        if (diff := time.time() - self._lastRequest) < 3:
            time.sleep(3 - diff)
        self._lastRequest = time.time()
        return self._get(*args, **kwargs)

    def _entries_to_items(self, entries, url):
        '''Convert parsed entry elements into items.

        entries is an iterable of bs4 elements, one per status; url is the URL of the page they came from
        and is used to resolve relative links. Yields a Toot for each entry, or a Boost wrapping the Toot
        if the entry is marked as a boost. "Load more" placeholder entries are skipped.
        '''
        for entry in entries:
            if entry.find('a', class_ = 'load-more'):
                continue

            tootKwargs = {}

            # Header: toot URL, ID, and date
            info = entry.find('div', class_ = 'status__info')
            if not info:  # Before 2.5.0 (commit bb71538b)
                info = entry.find('div', class_ = 'status__header')
            if not info:  # Detailed status (i.e. toot page rather than timeline)?
                info = entry.find('div', class_ = 'detailed-status__meta')
            link = info.find('a', class_ = 'status__relative-time')
            if not link:  # Detailed status?
                link = info.find('a', class_ = 'detailed-status__datetime')
            tootKwargs['url'] = link['href']
            tootKwargs['id'] = tootKwargs['url'].rsplit('/', 1)[1]
            tootKwargs['date'] = datetime.datetime.strptime(info.find('data', class_ = 'dt-published')['value'], '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo = datetime.timezone.utc)

            # Author
            userKwargs = {}
            userLink = info.find('a', class_ = 'status__display-name')
            if not userLink:  # Detailed status?
                userLink = entry.find('a', class_ = 'detailed-status__display-name')
            userNameSpan = userLink.find('span', class_ = 'display-name')
            userKwargs['account'] = userNameSpan.find('span').text.strip()
            if userKwargs['account'].count('@') == 1:  # Ancient versions don't include the instance for posts from accounts on the instance itself
                userKwargs['account'] = self._url_to_account(userLink['href'])
            userKwargs['_url'] = urllib.parse.urljoin(url, userLink['href'])
            userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userNameSpan.find('strong'), url)
            userKwargs['avatarUrl'] = urllib.parse.urljoin(url, userLink.find('img', class_ = 'u-photo')['src'])
            tootKwargs['user'] = User(**userKwargs)

            # Text, with or without a spoiler
            content = entry.find('div', class_ = 'status__content')
            if not content.find(class_ = 'status__content__spoiler-link'):
                tootKwargs['text'] = '\n\n'.join(p.text for p in content.find_all('p'))
            else:
                tootKwargs['text'] = content.find('span', class_ = 'p-summary').text
                tootKwargs['spoilerText'] = '\n\n'.join(p.text for p in content.find('div', class_ = 'e-content').find_all('p'))

            # Attachments; the markup differs between Mastodon versions
            if (attachmentsDiv := entry.find('div', class_ = 'attachment-list')):
                attachments = []
                for a in attachmentsDiv.find_all('a'):
                    attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a.text.strip()))
                tootKwargs['attachments'] = attachments
            elif (mediaGalleryDiv := entry.find('div', attrs = {'data-component': 'MediaGallery'})):  # Before 2.7.0 (https://github.com/mastodon/mastodon/issues/6714)
                o = json.loads(mediaGalleryDiv['data-props'])
                attachments = []
                for medium in o['media']:
                    attachments.append(Attachment(url = urllib.parse.urljoin(url, medium['url']), name = medium['url'].rsplit('/', 1)[-1].strip()))
                tootKwargs['attachments'] = attachments
            elif (attachmentsDiv := entry.find('div', class_ = 'status__attachments')):  # Before 2.3.0 (commit 2bbf987a)
                attachments = []
                for a in attachmentsDiv.find_all('a'):
                    attachments.append(Attachment(url = urllib.parse.urljoin(url, a['href']), name = a['href'].rsplit('/', 1)[1]))
                tootKwargs['attachments'] = attachments

            # Links in the content, classified by their CSS classes
            links = []
            mentionedUsers = []
            hashtags = []
            for a in content.find_all('a'):
                cls = a.get('class', [])
                if 'mention' in cls and 'u-url' in cls:
                    mentionUrl = urllib.parse.urljoin(url, a['href'])
                    mentionedUsers.append(User(account = self._url_to_account(mentionUrl), _url = mentionUrl))
                elif 'mention' in cls and 'hashtag' in cls:
                    hashtags.append(a.text.strip())
                else:
                    links.append(urllib.parse.urljoin(url, a['href']))
            # Empty lists are not passed on, so the corresponding Toot fields stay None
            if links:
                tootKwargs['links'] = links
            if mentionedUsers:
                tootKwargs['mentionedUsers'] = mentionedUsers
            if hashtags:
                tootKwargs['hashtags'] = hashtags

            # Poll, embedded as JSON in a data-props attribute
            if (pollDiv := entry.find('div', attrs = {'data-component': 'Poll'})):
                o = json.loads(pollDiv['data-props'])
                pollKwargs = {}
                pollKwargs['id'] = o['poll']['id']
                pollKwargs['expirationDate'] = datetime.datetime.strptime(o['poll']['expires_at'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo = datetime.timezone.utc)
                pollKwargs['multiple'] = o['poll']['multiple']
                pollKwargs['options'] = [PollOption(title = op['title'], votesCount = op['votes_count']) for op in o['poll']['options']]
                pollKwargs['votesCount'] = o['poll']['votes_count']
                if 'voters_count' in o['poll']:  # 3.0.0 (commit 3babf846)
                    pollKwargs['votersCount'] = o['poll']['voters_count']
                tootKwargs['poll'] = Poll(**pollKwargs)

            toot = Toot(**tootKwargs)

            # Boosts
            prepend = entry.find('div', class_ = 'status__prepend')
            if not prepend:  # Before 2.5.0 (commit bb71538b)
                prepend = entry.find('div', class_ = 'pre-header')
            if prepend and prepend.find('i', class_ = 'fa-retweet'):  # Is a boost
                userKwargs = {}
                userLink = prepend.find('a', class_ = 'status__display-name')
                # The user is always on this instance since that's the only place where boosts are shown, hence there is no explicit account span. Reconstruct from URL.
                userUrl = urllib.parse.urljoin(url, userLink['href'])
                assert userUrl.count('/') == 3 and userUrl.count('/@') == 1
                userKwargs['account'] = '@'.join(reversed(userUrl.split('/')[2:]))
                userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userLink.find('strong'), url)
                toot = Boost(user = User(**userKwargs), toot = toot)

            yield toot

    def _display_name(self, strong, url):
        '''Extract a display name from its <strong> element.

        Returns a tuple (plain, full). plain is the name as a string with each custom emoji replaced by
        its alt text. full is a list of strings and CustomEmoji objects in document order, or None if
        the name contains no custom emoji. url is used to resolve relative image URLs.
        '''
        outPlain = []
        outFull = []
        hasCustomEmoji = False
        for child in strong.children:
            if isinstance(child, bs4.element.NavigableString):
                outPlain.append(str(child))
                outFull.append(str(child))
            elif child.name == 'img' and 'custom-emoji' in child.get('class', []):
                hasCustomEmoji = True
                outPlain.append(child['alt'])
                outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['data-original']), staticUrl = urllib.parse.urljoin(url, child['data-static'])))
            elif child.name == 'img' and 'emojione' in child.get('class', []):
                # Version 2.0.0 (which first added custom emojis) to 2.9.4: no data-* attributes, only gets one of the URLs with no (easy, reliable) way of knowing which it is.
                hasCustomEmoji = True
                outPlain.append(child['alt'])
                # staticUrl must be passed explicitly: omitting it raised a TypeError while the field had no default.
                outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['src']), staticUrl = None))
            else:
                _logger.warning(f'Unexpected display name child: {child!r}')
        return ''.join(outPlain), outFull if hasCustomEmoji else None

    @staticmethod
    def _url_to_account(url):
        '''Convert an absolute profile URL into an account string of the form @username@domain.

        Recognised formats are https://domain/@username as well as https://domain/users/username,
        https://domain/accounts/username, and https://domain/profile/username.
        Raises ValueError for anything else.
        '''
        if url.count('/') == 3 and url.count('/@') == 1:
            # ['domain', '@username'] reversed and joined; the leading @ comes from the path segment
            return '@'.join(reversed(url.split('/')[2:]))
        # For the remaining formats, [2::2] picks the domain and the username and skips the path keyword in between
        if url.count('/') == 4 and '/users/' in url:  # E.g. Pleroma, also supported by Mastodon
            return '@' + '@'.join(reversed(url.split('/')[2::2]))
        if url.count('/') == 4 and '/accounts/' in url:  # E.g. Peertube
            return '@' + '@'.join(reversed(url.split('/')[2::2]))
        if url.count('/') == 4 and '/profile/' in url:  # E.g. Friendica
            return '@' + '@'.join(reversed(url.split('/')[2::2]))
        raise ValueError('Unrecognised account URL format')
|
||||
|
||||
|
||||
class MastodonProfileScraper(_MastodonCommonScraper):
    '''Scraper for the toots listed on an account's profile pages.'''

    name = 'mastodon-profile'

    def __init__(self, account, **kwargs):
        '''account is either a string of the form @account@domain or a profile URL, which is used as is.'''
        super().__init__(**kwargs)
        if account.startswith('@') and account.count('@') == 2:
            account, domain = account[1:].split('@')
            url = f'https://{domain}/@{account}'
        else:
            url = account
        self._url = url

    def get_items(self):
        '''Yield the items of the profile page by page, following the pagination links until there are none.

        Logs a warning and yields nothing if the account does not exist.
        Raises snscrape.base.ScraperException on unexpected status codes.
        '''
        initial = True
        while True:
            if initial:
                r = self._rate_limited_get(f'{self._url}/with_replies', headers = self._headers)
                if r.status_code not in (200, 404):
                    raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
                if r.status_code == 404:  # Possibly an old instance where with_replies doesn't exist, try without that.
                    r = self._rate_limited_get(self._url, headers = self._headers)
                    if r.status_code not in (200, 404):
                        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
                    if r.status_code == 404:
                        _logger.warning('Account does not exist')
                        return
                    _logger.warning('Old Mastodon instance, cannot retrieve reply toots')
                initial = False
            else:
                # url was set from the pagination link at the end of the previous iteration
                r = self._rate_limited_get(url, headers = self._headers)
                if r.status_code != 200:
                    raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            soup = bs4.BeautifulSoup(r.text, 'lxml')

            yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)

            # Beautiful Soup passes None to an attribute filter function when the tag lacks the attribute,
            # so the filter must not apply `in` to the value unconditionally.
            nextA = soup.find('a', class_ = 'load-more', href = lambda x: x is not None and ('?max_id=' in x or '&max_id=' in x))
            if not nextA:  # Before 2.5.0 (commit bb71538b)
                paginationDiv = soup.find('div', class_ = 'pagination')
                if paginationDiv:
                    nextA = paginationDiv.find('a', class_ = 'next')
            if not nextA:  # End of pagination
                break
            url = urllib.parse.urljoin(r.url, nextA['href'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('account', type = snscrape.base.nonempty_string('account'), help = 'A Mastodon account. This can be either a URL to the profile page or a string of the form @account@instance.example.org')

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.account)
|
||||
|
||||
|
||||
class MastodonTootScraperMode(enum.Enum):
    '''What MastodonTootScraper collects: only the referenced toot or the thread around it.'''

    SINGLE = 'single'
    THREAD = 'thread'

    @classmethod
    def _cli_from_args(cls, args):
        '''Select the mode from parsed CLI arguments: THREAD if args.thread is truthy, else SINGLE.'''
        return cls.THREAD if args.thread else cls.SINGLE
|
||||
|
||||
|
||||
class MastodonTootScraper(_MastodonCommonScraper):
    '''Scraper for a single toot page, optionally including the thread displayed around the toot.'''

    name = 'mastodon-toot'

    def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
        super().__init__(**kwargs)
        self._url = url
        self._mode = mode

    def get_items(self):
        '''Fetch the toot page and yield its items according to the mode.

        Logs a warning and yields nothing on a 404; raises snscrape.base.ScraperException on any other non-200 status.
        '''
        response = self._rate_limited_get(self._url, headers = self._headers)
        if response.status_code == 404:
            _logger.warning('Toot does not exist')
            return
        if response.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        if self._mode is MastodonTootScraperMode.SINGLE:
            # The entry is the element directly enclosing the detailed status
            entries = [soup.find('div', class_ = 'detailed-status').parent]
        elif self._mode is MastodonTootScraperMode.THREAD:
            entries = soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry')
        else:
            # Any other mode yields nothing
            return
        yield from self._entries_to_items(entries, response.url)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('--thread', action = 'store_true', help = 'Collect thread around the toot referenced by the URL')
        subparser.add_argument('url', type = snscrape.base.nonempty_string('url'), help = 'A URL for a toot')

    @classmethod
    def _cli_from_args(cls, args):
        mode = MastodonTootScraperMode._cli_from_args(args)
        return cls._cli_construct(args, args.url, mode = mode)
|
||||
285
snscrape/modules/reddit.py
Normal file
285
snscrape/modules/reddit.py
Normal file
@@ -0,0 +1,285 @@
|
||||
__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import snscrape.version
|
||||
import string
|
||||
import time
|
||||
import typing
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
||||
|
||||
@dataclasses.dataclass
class Submission(snscrape.base.Item):
    '''A Reddit submission as returned by Pushshift.

    str() of a submission is its URL.
    '''

    author: typing.Optional[str]  # E.g. submission hf7k6
    date: datetime.datetime
    id: str
    link: typing.Optional[str]
    selftext: typing.Optional[str]
    subreddit: typing.Optional[str]  # E.g. submission 617p51
    title: str
    url: str

    # Deprecated alias for date
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Comment(snscrape.base.Item):
    '''A Reddit comment as returned by Pushshift.

    str() of a comment is its URL.
    '''

    author: typing.Optional[str]
    body: str
    date: datetime.datetime
    id: str
    parentId: typing.Optional[str]
    subreddit: typing.Optional[str]
    url: str

    # Deprecated alias for date
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
def _cmp_id(id1, id2):
|
||||
'''Compare two Reddit IDs. Returns -1 if id1 is less than id2, 0 if they are equal, and 1 if id1 is greater than id2.
|
||||
|
||||
id1 and id2 may have prefixes like t1_, but if included, they must be present on both and equal.'''
|
||||
|
||||
if id1.startswith('t') and '_' in id1:
|
||||
prefix, id1 = id1.split('_', 1)
|
||||
if not id2.startswith(f'{prefix}_'):
|
||||
raise ValueError('id2 must have the same prefix as id1')
|
||||
_, id2 = id2.split('_', 1)
|
||||
if id1.strip(string.ascii_lowercase + string.digits) != '':
|
||||
raise ValueError('invalid characters in id1')
|
||||
if id2.strip(string.ascii_lowercase + string.digits) != '':
|
||||
raise ValueError('invalid characters in id2')
|
||||
if len(id1) < len(id2):
|
||||
return -1
|
||||
if len(id1) > len(id2):
|
||||
return 1
|
||||
if id1 < id2:
|
||||
return -1
|
||||
if id1 > id2:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
class _RedditPushshiftScraper(snscrape.base.Scraper):
    '''Base class of the Reddit scrapers: Pushshift API access and conversion of API objects into items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        userAgent = f'snscrape/{snscrape.version.__version__}'
        self._headers = {'User-Agent': userAgent}

    def _handle_rate_limiting(self, r):
        '''Response check passed to self._get; returns a tuple (ok, reason).

        Only a 200 response is ok. On a 429, sleeps for 10 seconds before reporting the failure.
        '''
        status = r.status_code
        if status == 200:
            return True, None
        if status == 429:
            _logger.info('Got 429 response, sleeping')
            time.sleep(10)
            return False, 'rate-limited'
        return False, 'non-200 status code'

    def _get_api(self, url, params = None):
        '''Request url from the API and return the decoded JSON body; raises snscrape.base.ScraperException unless the status is 200.'''
        response = self._get(url, params = params, headers = self._headers, responseOkCallback = self._handle_rate_limiting)
        if response.status_code == 200:
            return response.json()
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')

    def _api_obj_to_item(self, d):
        '''Build a Submission (if d has a title) or a Comment from the API object d.'''
        isSubmission = 'title' in d

        # Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all
        permalink = d.get('permalink')
        if permalink is None:
            # E.g. comment dovj2v7
            permalink = d.get('permalink_url')
        if permalink is None:
            if 'link_id' in d and d['link_id'].startswith('t3_'):  # E.g. comment doraazf
                if 'subreddit' in d:
                    permalink = f'/r/{d["subreddit"]}/comments/{d["link_id"][3:]}/_/{d["id"]}/'
                else:  # E.g. submission 617p51 but can likely happen for comments as well
                    permalink = f'/comments/{d["link_id"][3:]}/_/{d["id"]}/'
            else:
                _logger.warning('Unable to find or construct permalink')
                permalink = '/'

        url = f'https://old.reddit.com{permalink}'
        common = {
            'author': d.get('author'),
            'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
            'url': url,
            'subreddit': d.get('subreddit'),
        }

        if not isSubmission:
            return Comment(body = d['body'], parentId = d.get('parent_id'), id = f't1_{d["id"]}', **common)

        selftext = d.get('selftext') or None
        link = None
        if not selftext:
            # Relative link targets are resolved against old.reddit.com
            link = d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}'
            # A link that merely points back at the submission itself is dropped
            if link == url or url.replace('//old.reddit.com/', '//www.reddit.com/') == link:
                link = None
        return Submission(selftext = selftext, link = link, title = d['title'], id = f't3_{d["id"]}', **common)
|
||||
|
||||
|
||||
class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
|
||||
def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
    '''Store the search parameters and validate them.

    Raises ValueError if the class's _validationFunc rejects name or if both submissions and comments are disabled.
    '''
    super().__init__(**kwargs)
    self._name = name
    self._submissions = submissions
    self._comments = comments
    self._before = before
    self._after = after

    scraperClass = type(self)
    if not scraperClass._validationFunc(self._name):
        # The part of the scraper name after the first dash names the kind of entity
        raise ValueError(f'invalid {type(self).name.split("-", 1)[1]} name')
    if not (self._submissions or self._comments):
        raise ValueError('At least one of submissions and comments must be True')
|
||||
|
||||
def _iter_api(self, url, params = None):
    '''Page through a Pushshift endpoint via the 'before' parameter and yield the resulting items.

    The params dict, if given, is modified in place. Only objects whose ID is lower than every ID yielded so far are turned into items, which filters out the overlap between consecutive pages.'''
    lowestIdSeen = None
    if params is None:
        params = {}
    if self._before is not None:
        params['before'] = self._before
    if self._after is not None:
        params['after'] = self._after
    params['sort'] = 'desc'
    while True:
        data = self._get_api(url, params = params)['data']
        if not data:  # end of pagination
            break
        if lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in data):  # nothing new on this page: end of pagination
            break
        for d in data:
            if lowestIdSeen is not None and _cmp_id(d['id'], lowestIdSeen) != -1:
                continue
            yield self._api_obj_to_item(d)
            lowestIdSeen = d['id']
        # Continue from the timestamp of the last object on this page
        params['before'] = data[-1]['created_utc'] + 1
|
||||
|
||||
def _iter_api_submissions_and_comments(self, params: dict):
|
||||
# Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
|
||||
params['size'] = '1000'
|
||||
if self._submissions:
|
||||
submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
|
||||
else:
|
||||
submissionsIter = iter(())
|
||||
if self._comments:
|
||||
commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy())
|
||||
else:
|
||||
commentsIter = iter(())
|
||||
|
||||
try:
|
||||
tipSubmission = next(submissionsIter)
|
||||
except StopIteration:
|
||||
# There are no submissions, just yield comments and return
|
||||
yield from commentsIter
|
||||
return
|
||||
try:
|
||||
tipComment = next(commentsIter)
|
||||
except StopIteration:
|
||||
# There are no comments, just yield submissions and return
|
||||
yield tipSubmission
|
||||
yield from submissionsIter
|
||||
return
|
||||
|
||||
while True:
|
||||
# Return newer first; if both have the same creation datetime, return the comment first
|
||||
if tipSubmission.date > tipComment.date:
|
||||
yield tipSubmission
|
||||
try:
|
||||
tipSubmission = next(submissionsIter)
|
||||
except StopIteration:
|
||||
# Reached the end of submissions, just yield the remaining comments and stop
|
||||
yield tipComment
|
||||
yield from commentsIter
|
||||
break
|
||||
else:
|
||||
yield tipComment
|
||||
try:
|
||||
tipComment = next(commentsIter)
|
||||
except StopIteration:
|
||||
yield tipSubmission
|
||||
yield from submissionsIter
|
||||
break
|
||||
|
||||
def get_items(self):
|
||||
yield from self._iter_api_submissions_and_comments({type(self)._apiField: self._name})
|
||||
|
||||
@classmethod
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('--no-submissions', dest = 'noSubmissions', action = 'store_true', default = False, help = 'Don\'t list submissions')
|
||||
subparser.add_argument('--no-comments', dest = 'noComments', action = 'store_true', default = False, help = 'Don\'t list comments')
|
||||
subparser.add_argument('--before', metavar = 'TIMESTAMP', type = int, help = 'Fetch results before a Unix timestamp')
|
||||
subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp')
|
||||
name = cls.name.split('-', 1)[1]
|
||||
subparser.add_argument(name, type = snscrape.base.nonempty_string(name))
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
name = cls.name.split('-', 1)[1]
|
||||
return cls._cli_construct(args, getattr(args, name), submissions = not args.noSubmissions, comments = not args.noComments, before = args.before, after = args.after)
|
||||
|
||||
|
||||
class RedditUserScraper(_RedditPushshiftSearchScraper):
    '''Scraper for the submissions and comments of one Reddit user (Pushshift field 'author').'''

    name = 'reddit-user'
    # fullmatch rather than match with '^...$': '$' also matches just before a trailing newline,
    # so a name with a newline appended used to pass validation.
    _validationFunc = lambda x: re.fullmatch(r'[A-Za-z0-9_-]{3,20}', x)
    _apiField = 'author'
|
||||
|
||||
|
||||
class RedditSubredditScraper(_RedditPushshiftSearchScraper):
    '''Scraper for the submissions and comments of one subreddit (Pushshift field 'subreddit').'''

    name = 'reddit-subreddit'
    # fullmatch rather than match with '^...$': '$' also matches just before a trailing newline,
    # so a name with a newline appended used to pass validation.
    _validationFunc = lambda x: re.fullmatch(r'[A-Za-z0-9][A-Za-z0-9_]{2,20}', x)
    _apiField = 'subreddit'
|
||||
|
||||
|
||||
class RedditSearchScraper(_RedditPushshiftSearchScraper):
    '''Scraper for a free-text search (Pushshift field 'q').'''

    name = 'reddit-search'
    _apiField = 'q'
    # Every query string is accepted as is.
    _validationFunc = lambda query: True
|
||||
|
||||
|
||||
class RedditSubmissionScraper(_RedditPushshiftScraper):
    '''Scraper for one submission and the comments Pushshift lists for it.'''

    name = 'reddit-submission'

    def __init__(self, submissionId, **kwargs):
        '''Validate and store the submission ID (base-36 ID, optionally prefixed with 't3_').

        Raises ValueError if the ID is empty or contains anything but lowercase letters and digits.
        '''
        bareId = submissionId[3:] if submissionId.startswith('t3_') else submissionId
        # strip() only proves that no foreign character is present; it also yields '' for an empty ID, so reject that explicitly.
        if not bareId or bareId.strip(string.ascii_lowercase + string.digits) != '':
            raise ValueError('invalid submissionId')
        super().__init__(**kwargs)
        self._submissionId = submissionId

    def get_items(self):
        '''Yield the submission first, then its comments in batches of up to 500 IDs per request.

        Yields nothing if the submission is not found. Raises ScraperException if the lookup returns more than one result.
        '''
        obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')
        if not obj['data']:
            return
        if len(obj['data']) != 1:
            raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1')
        yield self._api_obj_to_item(obj['data'][0])

        obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')
        if not obj['data']:
            return
        commentIds = obj['data']
        for i in range(0, len(commentIds), 500):
            ids = commentIds[i : i + 500]
            obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}')
            yield from map(self._api_obj_to_item, obj['data'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the positional submissionId argument.'''
        subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId'))

    @classmethod
    def _cli_from_args(cls, args):
        '''Build a scraper instance from parsed CLI arguments.'''
        return cls._cli_construct(args, args.submissionId)
|
||||
331
snscrape/modules/telegram.py
Normal file
331
snscrape/modules/telegram.py
Normal file
@@ -0,0 +1,331 @@
|
||||
__all__ = ['LinkPreview', 'TelegramPost', 'Channel', 'TelegramChannelScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
import base64
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
|
||||
|
||||
@dataclasses.dataclass
class LinkPreview:
    '''Preview card attached to a post for a link; only the target `href` is required.'''

    # Target of the preview
    href: str
    # Optional parts of the card; each stays None when not supplied
    siteName: typing.Optional[str] = None
    title: typing.Optional[str] = None
    description: typing.Optional[str] = None
    image: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Channel(snscrape.base.Entity):
    '''A Telegram channel; only `username` is required. str() gives the channel's t.me/s/ URL.'''

    username: str
    title: typing.Optional[str] = None
    verified: typing.Optional[bool] = None
    photo: typing.Optional[str] = None
    description: typing.Optional[str] = None
    members: typing.Optional[int] = None
    # Counters carrying their own granularity
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    videos: typing.Optional[snscrape.base.IntWithGranularity] = None
    links: typing.Optional[snscrape.base.IntWithGranularity] = None
    files: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated aliases; each reads the granularity of the matching counter above
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
    linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
    filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')

    def __str__(self):
        return f'https://t.me/s/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
    '''A single channel post. str() gives the post URL.'''

    url: str
    date: datetime.datetime
    # None when the post has no text block
    content: typing.Optional[str]
    # The list fields default to None, so they are Optional
    outlinks: typing.Optional[typing.List[str]] = None
    mentions: typing.Optional[typing.List[str]] = None
    hashtags: typing.Optional[typing.List[str]] = None
    forwarded: typing.Optional['Channel'] = None
    forwardedUrl: typing.Optional[str] = None
    media: typing.Optional[typing.List['Medium']] = None
    # NOTE(review): the scraper assigns parse_num()'s (value, granularity) tuple here, not a bare int — confirm the intended type
    views: typing.Optional[int] = None
    linkPreview: typing.Optional[LinkPreview] = None

    # Deprecated alias: the outlinks joined with spaces
    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
class Medium:
    '''Common base class of the media types that can be attached to a post.'''
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo(Medium):
    '''A photo attached to a post.'''

    # Address of the image
    url: str
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video(Medium):
    '''A video attached to a post.'''

    thumbnailUrl: str
    # Length in seconds
    duration: float
    # Address of the video file, if one is available
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VoiceMessage(Medium):
    '''A voice message attached to a post.'''

    # Address of the audio file
    url: str
    # Length in seconds; the scraper fills this from durationStrToSeconds(), which returns an int, not a str
    duration: int
    # One number per bar of the waveform shown in the player
    bars: typing.List[float]
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Gif(Medium):
    '''An animation attached to a post; used for video players that show no duration.'''

    thumbnailUrl: str
    # Address of the media file, if one is available
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
class TelegramChannelScraper(snscrape.base.Scraper):
    '''Scraper for the public web preview of a Telegram channel (https://t.me/s/<name>).'''

    name = 'telegram-channel'

    def __init__(self, name, **kwargs):
        super().__init__(**kwargs)
        self._name = name
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        # Cache for the first preview page, shared by get_items and _get_entity
        self._initialPage = None
        self._initialPageSoup = None

    def _initial_page(self):
        '''Fetch the first preview page once and return the cached (response, soup) pair.

        Raises ScraperException on a non-200 response.
        '''
        if self._initialPage is None:
            r = self._get(f'https://t.me/s/{self._name}', headers = self._headers)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml')
        return self._initialPage, self._initialPageSoup

    def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
        '''Yield a TelegramPost for every post on the page, last post in the document first.

        With onlyUsername = True, yield just the username taken from the first post visited and stop.
        '''
        posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
        for post in reversed(posts):
            if onlyUsername:
                yield post['data-post'].split('/')[0]
                return
            dateDiv = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date')
            rawUrl = dateDiv['href']
            if not rawUrl.startswith('https://t.me/') or sum(x == '/' for x in rawUrl) != 4 or rawUrl.rsplit('/', 1)[1].strip('0123456789') != '':
                _logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
            url = rawUrl.replace('//t.me/', '//t.me/s/')
            # Drop the date dashes and all colons so that the value fits a single strptime format
            date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
            media = []
            outlinks = []
            mentions = []
            hashtags = []
            forwarded = None
            forwardedUrl = None

            if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
                forwardedUrl = forwardTag['href']
                forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
                forwarded = Channel(username = forwardedName)

            if (message := post.find('div', class_ = 'tgme_widget_message_text')):
                content = message.get_text(separator="\n")
            else:
                content = None

            for link in post.find_all('a'):
                if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
                    # Author links at the top (avatar and name)
                    continue
                if link['href'] == rawUrl or link['href'] == url:
                    style = link.attrs.get('style', '')
                    # Generic filter of links to the post itself, catches videos, photos, and the date link
                    if style != '':
                        imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                        if len(imageUrls) == 1:
                            media.append(Photo(url = imageUrls[0]))
                        continue
                if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
                    # Individual photo or video link
                    style = link.attrs.get('style', '')
                    imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                    if len(imageUrls) == 1:
                        media.append(Photo(url = imageUrls[0]))
                    continue
                if link.text.startswith('@'):
                    mentions.append(link.text.strip('@'))
                    continue
                if link.text.startswith('#'):
                    hashtags.append(link.text.strip('#'))
                    continue
                href = urllib.parse.urljoin(pageUrl, link['href'])
                if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
                    outlinks.append(href)

            for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
                audioUrl = voicePlayer.find('audio')['src']
                durationStr = voicePlayer.find('time').text
                duration = durationStrToSeconds(durationStr)
                # Each <s> inside the bar div carries one number as the last value of its style attribute
                barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]

                media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))

            for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
                iTag = videoPlayer.find('i')
                if iTag is None:
                    videoUrl = None
                    videoThumbnailUrl = None
                else:
                    style = iTag['style']
                    videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
                    videoTag = videoPlayer.find('video')
                    videoUrl = None if videoTag is None else videoTag['src']
                mKwargs = {
                    'thumbnailUrl': videoThumbnailUrl,
                    'url': videoUrl,
                }
                timeTag = videoPlayer.find('time')
                if timeTag is None:
                    # A player without a duration is treated as a GIF
                    cls = Gif
                else:
                    cls = Video
                    # Reuse the tag found above instead of searching the player a second time
                    mKwargs['duration'] = durationStrToSeconds(timeTag.text)
                media.append(cls(**mKwargs))

            linkPreview = None
            if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
                kwargs = {}
                kwargs['href'] = urllib.parse.urljoin(pageUrl, linkPreviewA['href'])
                if (siteNameDiv := linkPreviewA.find('div', class_ = 'link_preview_site_name')):
                    kwargs['siteName'] = siteNameDiv.text
                if (titleDiv := linkPreviewA.find('div', class_ = 'link_preview_title')):
                    kwargs['title'] = titleDiv.text
                if (descriptionDiv := linkPreviewA.find('div', class_ = 'link_preview_description')):
                    kwargs['description'] = descriptionDiv.text
                if (imageI := linkPreviewA.find('i', class_ = 'link_preview_image')):
                    if imageI['style'].startswith("background-image:url('"):
                        # 22 = len("background-image:url('")
                        kwargs['image'] = imageI['style'][22 : imageI['style'].index("'", 22)]
                    else:
                        _logger.warning(f'Could not process link preview image on {url}')
                linkPreview = LinkPreview(**kwargs)
                # The previewed link is reported through linkPreview, not through outlinks
                if kwargs['href'] in outlinks:
                    outlinks.remove(kwargs['href'])

            viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
            views = None if viewsSpan is None else parse_num(viewsSpan.text)

            yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)

    def get_items(self):
        '''Yield the channel's posts page by page until message 1 is reached or no further page can be determined.

        Yields nothing if the channel has no public post list. Raises ScraperException on a non-200 page response.
        '''
        r, soup = self._initial_page()
        if '/s/' not in r.url:
            _logger.warning('No public post list for this user')
            return
        nextPageUrl = ''
        while True:
            yield from self._soup_to_items(soup, r.url)
            # Explicit None check instead of a bare except, which would also have swallowed KeyboardInterrupt and real bugs
            firstDateLink = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
            if firstDateLink is not None and firstDateLink['href'].split('/')[-1] == '1':
                # if message 1 is the first message in the page, terminate scraping
                break
            pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
            if not pageLink:
                # some pages are missing a "tme_messages_more" tag, causing early termination
                if '=' not in nextPageUrl:
                    # NOTE(review): assumes the canonical link exists and ends in '=<number>' — confirm
                    nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
                nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
                if nextPostIndex > 20:
                    pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
                else:
                    break
            nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
            r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            soup = bs4.BeautifulSoup(r.text, 'lxml')

    def _get_entity(self):
        '''Return a Channel built from the /<name> page and the preview page, or None if the channel has no public posts.

        Raises ScraperException on a non-200 response.
        '''
        kwargs = {}
        # /channel has a more accurate member count and bigger profile picture
        r = self._get(f'https://t.me/{self._name}', headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        membersDiv = soup.find('div', class_ = 'tgme_page_extra')
        # find() returns None when the div is absent; skip the member count then rather than fail on .text
        if membersDiv is not None and membersDiv.text.endswith((' members', ' subscribers')):
            kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
        photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
        if photoImg is not None:
            kwargs['photo'] = photoImg.attrs['src']
        else:
            kwargs['photo'] = None

        r, soup = self._initial_page()
        if '/s/' not in r.url: # Redirect on channels without public posts
            return
        channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
        assert channelInfoDiv, 'channel info div not found'
        titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
        kwargs['title'] = titleDiv.find('span').text
        kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon'))
        # The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere.
        # However, the post URLs are, so extract the first post and use that.
        try:
            kwargs['username'] = next(self._soup_to_items(soup, r.url, onlyUsername = True))
        except StopIteration:
            # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
            _logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
            kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
        if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
            kwargs['description'] = descriptionDiv.text

        for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
            value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
            type_ = div.find('span', class_ = 'counter_type').text
            if type_ == 'members':
                # Already extracted more accurately from /channel, skip
                continue
            elif type_ in ('photos', 'videos', 'links', 'files'):
                kwargs[type_] = snscrape.base.IntWithGranularity(value, granularity)

        return Channel(**kwargs)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the positional channel argument.'''
        subparser.add_argument('channel', type = snscrape.base.nonempty_string('channel'), help = 'A channel name')

    @classmethod
    def _cli_from_args(cls, args):
        '''Build a scraper instance from parsed CLI arguments.'''
        return cls._cli_construct(args, args.channel)
|
||||
|
||||
def parse_num(s):
    '''Parse a counter string such as '1 234', '1.2K' or '3M'.

    Returns a (value, granularity) tuple: value is the integer count and granularity the size of
    the last digit given, e.g. '1.2K' -> (1200, 100). Raises ValueError if the number cannot be parsed.
    '''
    s = s.replace(' ', '')
    for suffix, exponent in (('M', 6), ('K', 3)):
        if s.endswith(suffix):
            mantissa = s[:-1]
            decimals = len(mantissa.split('.')[1]) if '.' in mantissa else 0
            # round() rather than int(): the float product can land just below the exact value
            # (e.g. 1004.9999999999999 for '1.005K'), and truncating would then lose one unit.
            return round(float(mantissa) * 10 ** exponent), 10 ** (exponent - decimals)
    return int(s), 1
|
||||
|
||||
def durationStrToSeconds(durationStr):
    '''Convert a duration written as 'S', 'M:S' or 'H:M:S' into a number of seconds.'''
    # Seconds per unit, smallest unit first to line up with the reversed parts. An hour is 3600 seconds (was 360).
    secondsPerUnit = (1, 60, 3600)
    parts = reversed(durationStr.split(':'))
    return sum(unit * int(part) for unit, part in zip(secondsPerUnit, parts))
|
||||
|
||||
def telegramResponseOkCallback(r):
    '''Response check passed to _get: (True, None) for status 200, otherwise (False, <text naming the status code>).'''
    succeeded = r.status_code == 200
    return (True, None) if succeeded else (False, f'{r.status_code=}')
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,100 +1,410 @@
|
||||
__all__ = ['VKontaktePost', 'Photo', 'PhotoVariant', 'Video', 'User', 'VKontakteUserScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import collections
|
||||
import dataclasses
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
try:
|
||||
import zoneinfo
|
||||
except ImportError:
|
||||
# Python 3.8 support; nowadays, Europe/Moscow is always UTC+3, but it's more complicated before 2014, so need proper zone info
|
||||
import pytz
|
||||
def _timezone(s):
|
||||
return pytz.timezone(s)
|
||||
def _localised_datetime(tz, *args, **kwargs):
|
||||
return tz.localize(datetime.datetime(*args, **kwargs))
|
||||
else:
|
||||
def _timezone(s):
|
||||
return zoneinfo.ZoneInfo(s)
|
||||
def _localised_datetime(tz, *args, **kwargs):
|
||||
return datetime.datetime(*args, tzinfo = tz, **kwargs)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Matches 'today|yesterday|<day> <Mon>[ <year>]|<Mon> <day>, <year>' followed by ' at <h>:<mm> am|pm'.
# Every fragment is a raw string: the fragment after the first month list used to be a plain literal,
# which made its \s and \d invalid string escape sequences.
_datePattern = re.compile(r'^(?P<date>today'
                          r'|yesterday'
                          r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
                          r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
                          r')'
                          r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
|
||||
|
||||
|
||||
class VKontaktePost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''A VK user or community. str() gives the profile URL.'''

    username: str
    name: str
    verified: bool
    description: typing.Optional[str] = None
    websites: typing.Optional[typing.List[str]] = None
    # Counters carrying their own granularity
    followers: typing.Optional[snscrape.base.IntWithGranularity] = None
    posts: typing.Optional[snscrape.base.IntWithGranularity] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    tags: typing.Optional[snscrape.base.IntWithGranularity] = None
    following: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated aliases; each reads the granularity of the matching counter above
    followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
    postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
    followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')

    def __str__(self):
        return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
    '''A wall post, or a post quoted inside one. str() gives the post URL.'''

    url: str
    # Single declaration of `date` (the superseded plain-datetime annotation is dropped):
    # a datetime when a time is known, a plain date for date-only strings, None when no date could be extracted.
    date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
    # None when the post has no text block
    content: typing.Optional[str]
    user: User
    outlinks: typing.Optional[typing.List[str]] = None
    photos: typing.Optional[typing.List['Photo']] = None
    video: typing.Optional['Video'] = None
    quotedPost: typing.Optional['VKontaktePost'] = None

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo:
    '''A photo attached to a post, available in one or more size variants.'''

    variants: typing.List['PhotoVariant']
    # Address of the photo's own page, when one is known
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PhotoVariant:
    '''One size variant of a photo: its address and pixel dimensions.'''

    url: str
    width: int
    height: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video:
    '''A video attached to a post.'''

    # Values taken from the thumbnail link's data attributes
    id: str
    list: str
    duration: int
    # Address of the video and of its thumbnail image
    url: str
    thumbUrl: str
|
||||
|
||||
|
||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
name = 'vkontakte-user'
|
||||
|
||||
def __init__(self, username, **kwargs):
    '''Remember the username and derive the wall URL and request headers from it.'''
    super().__init__(**kwargs)
    self._username = username
    self._baseUrl = f'https://vk.com/{username}'
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Cache for the first page, filled by _initial_page
    self._initialPage = None
    self._initialPageSoup = None
|
||||
|
||||
def _soup_to_items(self, soup, baseUrl):
|
||||
for post in soup.find_all('div', class_ = 'post'):
|
||||
def _away_a_to_url(self, a):
|
||||
# Transform an <a> tag with an href of /away.php?to=... to a plain URL; returns None if a doesn't have that form.
|
||||
if a and a.get('href', '').startswith('/away.php?to='):
|
||||
end = a['href'].find('&', 13)
|
||||
if end == -1:
|
||||
end = None
|
||||
return urllib.parse.unquote(a['href'][13 : end])
|
||||
return None
|
||||
|
||||
def _date_span_to_date(self, dateSpan):
|
||||
if not dateSpan:
|
||||
return None
|
||||
if 'time' in dateSpan.attrs:
|
||||
return datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc)
|
||||
if (match := _datePattern.match(dateSpan.text)):
|
||||
# Datetime information down to minutes
|
||||
tz = _timezone('Europe/Moscow')
|
||||
if match.group('date') in ('today', 'yesterday'):
|
||||
date = datetime.datetime.now(tz = tz)
|
||||
if match.group('date') == 'yesterday':
|
||||
date -= datetime.timedelta(days = 1)
|
||||
year, month, day = date.year, date.month, date.day
|
||||
else:
|
||||
year = int(match.group('year1') or match.group('year2') or datetime.datetime.now(tz = tz).year)
|
||||
month = _months.index(match.group('month1') or match.group('month2')) + 1
|
||||
day = int(match.group('day1') or match.group('day2'))
|
||||
hour = int(match.group('hour'))
|
||||
# Damn AM/PM...
|
||||
if hour == 12:
|
||||
hour -= 12
|
||||
if match.group('ampm') == 'pm':
|
||||
hour += 12
|
||||
minute = int(match.group('minute'))
|
||||
return _localised_datetime(tz, year, month, day, hour, minute)
|
||||
if (match := re.match(r'^(?P<day>\d+)\s+(?P<month>' + '|'.join(_months) + r')\s+(?P<year>\d{4})$', dateSpan.text)):
|
||||
# Date only
|
||||
return datetime.date(int(match.group('year')), _months.index(match.group('month')) + 1, int(match.group('day')))
|
||||
if dateSpan.text not in ('video', 'photo'): # Silently ignore video and photo reposts which have no original date attached
|
||||
_logger.warning(f'Could not parse date string: {dateSpan.text!r}')
|
||||
|
||||
def _post_div_to_item(self, post, isCopy = False):
    '''Convert one post <div> into a VKontaktePost.

    With isCopy = True, post is the quoted ('copy') part of another post, which uses different
    link and date elements. Returns None, after logging a warning, if the post has no link.

    The stale lines that yielded a bare VKontaktePost (using the undefined name baseUrl and omitting
    the required user) are removed: they turned this function into a generator and conflicted with
    the return at the end.
    '''
    postLink = post.find('a', class_ = 'post_link' if not isCopy else 'published_by_date')
    if not postLink:
        _logger.warning(f'Skipping post without link: {str(post)[:200]!r}')
        return
    url = urllib.parse.urljoin(self._baseUrl, postLink['href'])
    assert (url.startswith('https://vk.com/wall') or (isCopy and (url.startswith('https://vk.com/video') or url.startswith('https://vk.com/photo')))) and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') in ('', '?reply=')
    if not isCopy:
        dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
    else:
        dateSpan = post.find('div', class_ = 'copy_post_date').find('a', class_ = 'published_by_date')
    textDiv = post.find('div', class_ = 'wall_post_text')
    outlinks = [h for a in textDiv.find_all('a') if (h := self._away_a_to_url(a))] if textDiv else []
    if (mediaLinkDiv := post.find('div', class_ = 'media_link')) and \
       (mediaLinkA := mediaLinkDiv.find('a', class_ = 'media_link__title')) and \
       (href := self._away_a_to_url(mediaLinkA)) and \
       href not in outlinks:
        outlinks.append(href)
    photos = None
    video = None
    if (thumbsDiv := (post.find('div', class_ = 'wall_text') if not isCopy else post).find('div', class_ = 'page_post_sized_thumbs')) and \
       not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
        photos = []
        for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
            if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
                _logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
                continue
            if 'data-video' in a.attrs:
                # Video
                if 'data-link-attr' in a.attrs:
                    hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
                else:
                    hrefUrl = f'https://vk.com{a["href"]}'
                video = Video(
                    id = a['data-video'],
                    list = a['data-list'],
                    duration = int(a['data-duration']),
                    url = hrefUrl,
                    # 22 = len('background-image: url(')
                    thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
                )
                continue
            # From here on: photo
            if 'onclick' not in a.attrs or not a['onclick'].startswith("return showPhoto('") or '{"temp":' not in a['onclick'] or not a['onclick'].endswith('}, event)'):
                _logger.warning(f'Photo thumb wrap on {url} has no or unexpected onclick, skipping')
                continue
            photoData = a['onclick'][a['onclick'].find('{"temp":') : -8] # -8 = len(', event)')
            photoObj = json.loads(photoData)
            singleLetterKeys = [k for k in photoObj['temp'].keys() if len(k) == 1 and 97 <= ord(k) <= 122] # 97 = ord('a'), 122 = ord('z')
            for x in singleLetterKeys:
                # Merge base into URLs
                if not photoObj['temp'][x].startswith('https://'):
                    photoObj['temp'][x] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x]}'
                x_ = f'{x}_'
                if not photoObj['temp'][x_][0].startswith('https://'):
                    photoObj['temp'][x_][0] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x_][0]}'
            if any(k not in {'base', 'w', 'w_', 'x', 'x_', 'y', 'y_', 'z', 'z_'} for k in photoObj['temp'].keys()) or \
               not all(photoObj['temp'][x] in (photoObj['temp'][f'{x}_'][0], photoObj['temp'][f'{x}_'][0] + '.jpg') for x in singleLetterKeys) or \
               not all(photoObj['temp'][x].startswith('https://sun') and '.userapi.com/' in photoObj['temp'][x] for x in singleLetterKeys) or \
               not all(len(photoObj['temp'][(x_ := f'{x}_')]) == 3 and isinstance(photoObj['temp'][x_][1], int) and isinstance(photoObj['temp'][x_][2], int) for x in singleLetterKeys):
                _logger.warning(f'Photo thumb wrap on {url} has unexpected data structure, skipping')
                continue
            photoVariants = []
            for x in singleLetterKeys:
                x_ = f'{x}_'
                photoVariants.append(PhotoVariant(url = f'{photoObj["temp"][x_][0]}.jpg' if '.jpg' not in photoObj['temp'][x_][0] else photoObj['temp'][x_][0], width = photoObj['temp'][x_][1], height = photoObj['temp'][x_][2]))
            photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
            photos.append(Photo(variants = photoVariants, url = photoUrl))
    quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
    authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
    authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
    username = authorLink['href'].split('/')[-1]
    name = authorLink.text
    verified = authorHeading.find('div', class_ = 'page_verified') is not None
    user = User(username = username, name = name, verified = verified)
    return VKontaktePost(
        url = url,
        date = self._date_span_to_date(dateSpan),
        content = textDiv.text if textDiv else None,
        user = user,
        # Empty containers are reported as None
        outlinks = outlinks or None,
        photos = photos or None,
        video = video or None,
        quotedPost = quotedPost,
    )
|
||||
|
||||
def _soup_to_items(self, soup):
|
||||
for post in soup.find_all('div', class_ = 'post'):
|
||||
yield self._post_div_to_item(post)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
||||
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
    """Yield the posts of the wall: first those on the initial page, then those from paginated requests.

    The initial page comes from `self._initial_page()`; every further page is
    requested through `self._get_wall_offset()` in steps of 10. Nothing is
    yielded (and a message is logged) when the wall does not exist, the profile
    is private, the page carries a `profile_deleted_text` heading, or the wall
    has no posts.

    Post IDs of the last 1000 yielded items are remembered so that a post seen
    again on a later page (e.g. while probing around a geoblocked offset) is
    not yielded twice.

    Raises:
        snscrape.base.ScraperException: if a pagination response is neither a
            post list, the end-of-wall marker, nor the known geoblock marker.
    """
    r, soup = self._initial_page()
    if r.status_code == 404:
        _logger.warning('Wall does not exist')
        return

    if soup.find('div', class_ = 'profile_closed_wall_dummy'):
        _logger.warning('Private profile')
        return

    if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
        # Unclear what this state represents, so just log website text.
        _logger.warning(profileDeleted.text)
        return

    newestPost = soup.find('div', class_ = 'post')
    if not newestPost:
        _logger.info('Wall has no posts')
        return
    ownerID = newestPost.attrs['data-post-id'].split('_')[0]
    # If there is a pinned post, we need its ID for the pagination requests
    if 'post_fixed' in newestPost.attrs['class']:
        fixedPostID = int(newestPost.attrs['id'].split('_')[1])
    else:
        fixedPostID = ''

    last1000PostIDs = collections.deque(maxlen = 1000)

    def _process_soup(soup):
        # Yield only items whose post ID has not been yielded recently.
        for item in self._soup_to_items(soup):
            postID = int(item.url.rsplit('_', 1)[1])
            if postID not in last1000PostIDs:
                yield item
                last1000PostIDs.append(postID)

    yield from _process_soup(soup)

    lastWorkingOffset = 0
    for offset in itertools.count(start = 10, step = 10):
        posts = self._get_wall_offset(fixedPostID, ownerID, offset)
        if posts.startswith('<div class="page_block no_posts">'):
            # Reached the end
            break
        if not posts.startswith('<div id="post'):
            if posts == '"\\/blank.php?block=119910902"':
                _logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
                # Walk the offsets one by one from just after the last page that worked up to the end of the blocked page.
                for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
                    geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
                    if geoPosts.startswith('<div class="page_block no_posts">'):
                        # No breaking the outer loop, it'll just make one extra request and exit as well
                        break
                    if not geoPosts.startswith('<div id="post'):
                        if geoPosts == '"\\/blank.php?block=119910902"':
                            continue
                        raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
                    yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
                continue
            raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
        lastWorkingOffset = offset
        soup = bs4.BeautifulSoup(posts, 'lxml')
        yield from _process_soup(soup)
|
||||
|
||||
def _get_wall_offset(self, fixedPostID, ownerID, offset):
    """Request one page of the wall at `offset` and return its HTML payload as a string.

    Args:
        fixedPostID: value sent as the `fixed` form field.
        ownerID: value sent as the `owner_id` form field.
        offset: value sent as both `offset` and `wall_start_from`.

    Raises:
        snscrape.base.ScraperException: if the response status code is not 200.
    """
    requestHeaders = self._headers.copy()
    requestHeaders['X-Requested-With'] = 'XMLHttpRequest'
    _logger.info(f'Retrieving page offset {offset}')
    formFields = [
        ('act', 'get_wall'),
        ('al', 1),
        ('fixed', fixedPostID),
        ('offset', offset),
        ('onlyCache', 'false'),
        ('owner_id', ownerID),
        ('type', 'own'),
        ('wall_start_from', offset),
    ]
    response = self._post('https://vk.com/al_wall.php', data = formFields, headers = requestHeaders)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
    payload = response.json()['payload']
    return payload[1][0]
|
||||
|
||||
def _get_entity(self):
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
kwargs = {}
|
||||
kwargs['username'] = r.url.rsplit('/', 1)[1]
|
||||
nameH1 = soup.find('h1', class_ = 'page_name')
|
||||
kwargs['name'] = nameH1.text
|
||||
kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
|
||||
|
||||
if (descriptionDiv := soup.find('div', id = 'page_current_info')):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
|
||||
if (infoDiv := soup.find('div', id = 'page_info_wrap')):
|
||||
websites = []
|
||||
for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
|
||||
if 'profile_info_row' in rowDiv['class']:
|
||||
labelDiv = rowDiv.find('div', class_ = 'fl_l')
|
||||
if not labelDiv or labelDiv.text != 'Website:':
|
||||
continue
|
||||
else: # group_info_row
|
||||
if rowDiv['title'] == 'Description':
|
||||
kwargs['description'] = rowDiv.text
|
||||
if rowDiv['title'] != 'Website':
|
||||
continue
|
||||
for a in rowDiv.find_all('a'):
|
||||
if not a['href'].startswith('/away.php?to='):
|
||||
_logger.warning(f'Skipping odd website link: {a["href"]!r}')
|
||||
continue
|
||||
websites.append(urllib.parse.unquote(a['href'].split('=', 1)[1].split('&', 1)[0]))
|
||||
if websites:
|
||||
kwargs['websites'] = websites
|
||||
|
||||
def parse_num(s: str) -> typing.Tuple[int, int]:
|
||||
if s.endswith('K'):
|
||||
return int(s[:-1]) * 1000, 1000
|
||||
elif s.endswith('M'):
|
||||
baseNum = s[:-1]
|
||||
precision = 1000000
|
||||
if '.' in s:
|
||||
precision //= (10 ** len(baseNum.split('.')[1]))
|
||||
return int(float(baseNum) * 1000000), precision
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
if (countsDiv := soup.find('div', class_ = 'counts_module')):
|
||||
for a in countsDiv.find_all('a', class_ = 'page_counter'):
|
||||
count, granularity = parse_num(a.find('div', class_ = 'count').text)
|
||||
label = a.find('div', class_ = 'label').text
|
||||
if label in ('follower', 'post', 'photo', 'tag'):
|
||||
label = f'{label}s'
|
||||
if label in ('followers', 'posts', 'photos', 'tags'):
|
||||
kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)
|
||||
|
||||
if (idolsDiv := soup.find('div', id = 'profile_idols')):
|
||||
if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
|
||||
kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
|
||||
# On public pages, this is where followers are listed
|
||||
if (followersDiv := soup.find('div', id = 'public_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
# On community groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
|
||||
# On public groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', id = 'group_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
@classmethod
def setup_parser(cls, subparser):
    """Register the positional `username` argument on `subparser`.

    NOTE(review): looks like the older name of `_cli_setup_parser` — confirm whether anything still calls it.
    """
    subparser.add_argument('username', help = 'A VK username')

@classmethod
def _cli_setup_parser(cls, subparser):
    """Register the positional `username` argument on `subparser`, validated by `snscrape.base.nonempty_string`."""
    subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A VK username')
|
||||
|
||||
@classmethod
def from_args(cls, args):
    """Construct the scraper from parsed arguments, passing `args.retries` through.

    NOTE(review): looks like the older name of `_cli_from_args` — confirm whether anything still calls it.
    """
    return cls(args.username, retries = args.retries)

@classmethod
def _cli_from_args(cls, args):
    """Construct the scraper from parsed arguments via `cls._cli_construct`, with `args.username` as the positional argument."""
    return cls._cli_construct(args, args.username)
|
||||
|
||||
151
snscrape/modules/weibo.py
Normal file
151
snscrape/modules/weibo.py
Normal file
@@ -0,0 +1,151 @@
|
||||
__all__ = ['Post', 'User', 'WeiboUserScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
# Sentinel stored in a scraper's `_user` attribute when resolving the username shows that the account does not exist
_userDoesNotExist = object()
|
||||
# Matches a single HTML tag; substituted with '' to turn a post's HTML `text` into plain text when no `raw_text` is supplied
_HTML_STRIP_PATTERN = re.compile(r'<[^>]*>')
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Post(snscrape.base.Item):
    """A single Weibo post; its string form is the post URL."""

    # Identification
    url: str
    id: str
    user: typing.Optional['User']

    # Content
    createdAt: str # Can have a variety of inconsistent formats
    text: str

    # Counters as delivered by the API; absent values are None
    repostsCount: typing.Optional[int]
    commentsCount: typing.Optional[typing.Union[int, str]]
    likesCount: typing.Optional[int]
    picturesCount: typing.Optional[int]

    # Attached media and links
    pictures: typing.Optional[typing.List[str]] # May be shorter than picturesCount if the API didn't return all of them (e.g. post Ipay2evb0)
    video: typing.Optional[str]
    link: typing.Optional[str]

    # The post this one reposts, if any
    repostedPost: typing.Optional['Post']

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Entity):
    """A Weibo account; its string form is the account's m.weibo.cn URL built from `uid`."""

    # Identity
    screenname: str
    uid: int

    # Verification status and, when given, the stated reason
    verified: bool
    verifiedReason: typing.Optional[str]

    # Profile text
    description: str

    # Counters
    statusesCount: int
    followersCount: int
    followCount: int

    # Avatar value taken from the API's `avatar_hd` field
    avatar: str

    def __str__(self):
        return f'https://m.weibo.cn/u/{self.uid}'
|
||||
|
||||
|
||||
class WeiboUserScraper(snscrape.base.Scraper):
    """Scraper for the posts and profile of one Weibo user, using the m.weibo.cn endpoints.

    `user` is either a numeric user ID (`int`) or a username (anything else);
    usernames are resolved to an ID on first use.
    """

    name = 'weibo-user'

    def __init__(self, user, **kwargs):
        super().__init__(**kwargs)
        self._user = user
        self._isUserId = isinstance(user, int)
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}

    def _ensure_user_id(self):
        """Resolve a username in `self._user` to a numeric ID, once.

        After this call `self._user` is either the integer ID or the
        `_userDoesNotExist` sentinel.

        Raises:
            snscrape.base.ScraperException: if the response is neither the
                expected redirect nor the user-does-not-exist page.
        """
        # Nothing to do if already resolved, including the case where resolution established that the user does not exist;
        # without the sentinel check, a second call would format the sentinel object into the request URL.
        if self._isUserId or self._user is _userDoesNotExist:
            return
        r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
        # A redirect lacking a Location header is treated like any other unexpected response below.
        location = r.headers.get('Location', '') if r.status_code == 302 else ''
        # Expected redirect target: '/u/' followed by exactly ten digits (13 characters in total).
        if location.startswith('/u/') and len(location) == 13 and location[3:].strip('0123456789') == '':
            # Redirect to uid URL
            self._user = int(location[3:])
            self._isUserId = True
        elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
            _logger.warning('User does not exist')
            self._user = _userDoesNotExist
        else:
            raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')

    def _check_timeline_response(self, r):
        """Response callback for timeline requests: return `(ok, reason)`."""
        if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
            # 'No content here yet'. Appears to happen sometimes on pagination, possibly due to too fast requests; retry this
            return False, 'no-content message'
        if r.status_code != 200:
            return False, 'non-200 status code'
        return True, None

    def _mblog_to_item(self, mblog):
        """Convert one `mblog` dict from the API into a `Post`, recursing into a reposted post if present."""
        pageInfo = mblog['page_info'] if 'page_info' in mblog else None
        return Post(
            url = f'https://m.weibo.cn/status/{mblog["bid"]}',
            id = mblog['id'],
            user = self._user_info_to_entity(mblog['user']) if mblog['user'] is not None else None,
            createdAt = mblog['created_at'],
            # Prefer the plain-text field; otherwise strip the tags from the HTML one.
            text = mblog['raw_text'] if 'raw_text' in mblog else _HTML_STRIP_PATTERN.sub('', mblog['text']),
            repostsCount = mblog.get('reposts_count'),
            commentsCount = mblog.get('comments_count'),
            likesCount = mblog.get('attitudes_count'),
            picturesCount = mblog.get('pic_num'),
            pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None,
            video = pageInfo['media_info']['mp4_720p_mp4'] if pageInfo is not None and pageInfo['type'] == 'video' else None,
            link = pageInfo['page_url'] if pageInfo is not None and pageInfo['type'] == 'webpage' else None,
            repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None,
        )

    def get_items(self):
        """Yield the user's posts, following `since_id` pagination until the API stops returning one.

        Yields nothing if the user does not exist.

        Raises:
            snscrape.base.ScraperException: if a timeline request does not end with status 200.
        """
        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        sinceId = None
        while True:
            sinceParam = f'&since_id={sinceId}' if sinceId is not None else ''
            r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            o = r.json()
            for card in o['data']['cards']:
                # Only cards of type 9 are converted to posts; everything else is skipped with a warning.
                if card['card_type'] != 9:
                    _logger.warning(f'Skipping card of type {card["card_type"]}')
                    continue
                yield self._mblog_to_item(card['mblog'])
            if 'since_id' not in o['data']['cardlistInfo']:
                # End of pagination
                break
            sinceId = o['data']['cardlistInfo']['since_id']

    def _user_info_to_entity(self, userInfo):
        """Convert a user-info dict from the API into a `User`."""
        return User(
            screenname = userInfo['screen_name'],
            uid = userInfo['id'],
            verified = userInfo['verified'],
            verifiedReason = userInfo.get('verified_reason'),
            description = userInfo['description'],
            statusesCount = userInfo['statuses_count'],
            followersCount = userInfo['followers_count'],
            followCount = userInfo['follow_count'],
            avatar = userInfo['avatar_hd'],
        )

    def _get_entity(self):
        """Fetch the user's profile and return it as a `User`, or `None` if the user does not exist.

        Raises:
            snscrape.base.ScraperException: if the profile request does not return status 200.
        """
        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException('Could not fetch user info')
        o = r.json()
        return self._user_info_to_entity(o['data']['userInfo'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        """Register the `--name` flag and the positional `user` argument on `subparser`."""
        subparser.add_argument('--name', dest = 'isName', action = 'store_true', help = 'Use username instead of user ID')
        subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user ID')

    @classmethod
    def _cli_from_args(cls, args):
        """Construct the scraper from parsed arguments; `user` is converted to `int` unless `--name` was given."""
        return cls._cli_construct(args, user = args.user if args.isName else int(args.user))
|
||||
@@ -1,7 +1,7 @@
|
||||
import pkg_resources
|
||||
import importlib.metadata
|
||||
|
||||
|
||||
# Look up the version of the installed `snscrape` distribution; `__version__` is None when no such distribution is found.
try:
    __version__ = importlib.metadata.version('snscrape')
except importlib.metadata.PackageNotFoundError:
    __version__ = None
|
||||
|
||||
Reference in New Issue
Block a user