mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-10 19:38:29 +03:00
Compare commits
354 Commits
v0.3.1
...
more-tg-in
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cacd783b95 | ||
|
|
3dd9c28e31 | ||
|
|
7186c833dd | ||
|
|
1c3a592415 | ||
|
|
285d5874fc | ||
|
|
adac052723 | ||
|
|
edac5f38cb | ||
|
|
b93cf2640c | ||
|
|
e47fbe3d1f | ||
|
|
99050710d7 | ||
|
|
3f7bb0516d | ||
|
|
98b50ff9e9 | ||
|
|
fd75fff202 | ||
|
|
c77d19da5d | ||
|
|
945bfbde04 | ||
|
|
0942beedd6 | ||
|
|
3545837637 | ||
|
|
aa8d93e07c | ||
|
|
7061ad2eb5 | ||
|
|
03ef3debaf | ||
|
|
42cb6d8170 | ||
|
|
ea7c6786c2 | ||
|
|
61dbbba6b1 | ||
|
|
d1592177ab | ||
|
|
21cf626803 | ||
|
|
f329b69ed4 | ||
|
|
f109f3fd46 | ||
|
|
7330e0a9a0 | ||
|
|
4e6956e564 | ||
|
|
4e70306f99 | ||
|
|
7327a01397 | ||
|
|
880a0a7f55 | ||
|
|
57b126c656 | ||
|
|
82f64a6472 | ||
|
|
6a6b02cb28 | ||
|
|
3d6cd63a00 | ||
|
|
9a2f1524c2 | ||
|
|
b5694e01a2 | ||
|
|
280b972f22 | ||
|
|
6ba478657b | ||
|
|
71fb33af70 | ||
|
|
c65e36a094 | ||
|
|
206907612d | ||
|
|
fe5d90b748 | ||
|
|
f1cb96b685 | ||
|
|
8709282ba0 | ||
|
|
0933a30e37 | ||
|
|
d60ce38b6a | ||
|
|
23ebdd2a3c | ||
|
|
35c0c32c38 | ||
|
|
b515a66b93 | ||
|
|
36e85c54c1 | ||
|
|
49270f6d3a | ||
|
|
d0fb9ab8a9 | ||
|
|
5d3f27bc2b | ||
|
|
b7cb270b6e | ||
|
|
8ad26fc7d1 | ||
|
|
1fb5c39168 | ||
|
|
d81d247a87 | ||
|
|
564a5eca77 | ||
|
|
bf0e720b5a | ||
|
|
27374285a2 | ||
|
|
238bdcd560 | ||
|
|
e846a6a4cd | ||
|
|
cbeb65d5c9 | ||
|
|
3e19f8f84b | ||
|
|
28f5a45825 | ||
|
|
2196bdf3e8 | ||
|
|
faf09b2f5e | ||
|
|
3e297c9a42 | ||
|
|
a0414d92cf | ||
|
|
ff5e2d61ee | ||
|
|
129ad3fc34 | ||
|
|
7de8d734e9 | ||
|
|
ceb06664f0 | ||
|
|
996cf882cc | ||
|
|
e449d5cdbe | ||
|
|
cbdaee6864 | ||
|
|
a3bee057b1 | ||
|
|
6f9a0e6534 | ||
|
|
4ff4af13cf | ||
|
|
e09aea70e7 | ||
|
|
cbdfeed812 | ||
|
|
aa325fa1a5 | ||
|
|
46a603053c | ||
|
|
59abeaf04c | ||
|
|
e13033fea0 | ||
|
|
9294c26ffa | ||
|
|
d6bce5b1d6 | ||
|
|
2c7a85a620 | ||
|
|
ff18f6f771 | ||
|
|
da3d870e10 | ||
|
|
279d1cf4a1 | ||
|
|
73f10a4f24 | ||
|
|
d72b51953f | ||
|
|
056cd6215c | ||
|
|
d5b406bc1b | ||
|
|
56e4232083 | ||
|
|
50899c01f3 | ||
|
|
bcad6923c2 | ||
|
|
0d361685ff | ||
|
|
530f4fa122 | ||
|
|
dc6bc9bf9d | ||
|
|
01cf6a09b3 | ||
|
|
ef7c4fad3e | ||
|
|
65723f10ff | ||
|
|
07a5f6fd7d | ||
|
|
0822a9c354 | ||
|
|
faeffe2603 | ||
|
|
e3bdc02a7c | ||
|
|
e2d922301e | ||
|
|
b13e62eb5d | ||
|
|
f38513503d | ||
|
|
0a4bd39ca6 | ||
|
|
c18ca0f047 | ||
|
|
5648e957d0 | ||
|
|
21f7b620ec | ||
|
|
9b3faec980 | ||
|
|
97d38e5cde | ||
|
|
b276c3cc27 | ||
|
|
1e4e0c278d | ||
|
|
babcddda19 | ||
|
|
ed3ea944d1 | ||
|
|
e7a6d38a5f | ||
|
|
6c50eee31b | ||
|
|
5103a33afa | ||
|
|
247bd82d79 | ||
|
|
5fc67f2bcf | ||
|
|
65e7d8bd24 | ||
|
|
3870282a42 | ||
|
|
7c0fcdec43 | ||
|
|
9af1f19034 | ||
|
|
5fc3c0e290 | ||
|
|
f978954bb3 | ||
|
|
2ce014ade4 | ||
|
|
5d156c6a15 | ||
|
|
4e59638e7c | ||
|
|
a7eb54d226 | ||
|
|
d32c9add8a | ||
|
|
fb8d73ac95 | ||
|
|
ed829163a0 | ||
|
|
694657ef80 | ||
|
|
b8efce2a12 | ||
|
|
1ab0f4fccb | ||
|
|
3a92b5bf0d | ||
|
|
2480b173f4 | ||
|
|
de4ebed81f | ||
|
|
72b26f2373 | ||
|
|
77bbb9f61f | ||
|
|
57a624c618 | ||
|
|
b1cfd51121 | ||
|
|
ace2c16f54 | ||
|
|
2f9c0457df | ||
|
|
878f2a3c7a | ||
|
|
25ee014e29 | ||
|
|
a192dc6236 | ||
|
|
a7242f340b | ||
|
|
359cc25cdf | ||
|
|
01799a7391 | ||
|
|
b0753c34ed | ||
|
|
7f78fa0bc0 | ||
|
|
8702a9c7e2 | ||
|
|
8ac1fd3ea8 | ||
|
|
9235890f9a | ||
|
|
7d939c110c | ||
|
|
8e95e9a9a7 | ||
|
|
aa7d7d3dc3 | ||
|
|
560c78c5cf | ||
|
|
107c3c71c2 | ||
|
|
7f88678253 | ||
|
|
52e4f9fb69 | ||
|
|
eebdfc1c55 | ||
|
|
e6076353c8 | ||
|
|
a32d79fab2 | ||
|
|
65391297f6 | ||
|
|
deb2659dd6 | ||
|
|
93e62744d7 | ||
|
|
3f3632d341 | ||
|
|
5070953feb | ||
|
|
853848ed5d | ||
|
|
0b4abdc43f | ||
|
|
267b7d0e32 | ||
|
|
acb7f10a4f | ||
|
|
afb6bfc429 | ||
|
|
ec5626097a | ||
|
|
ca00b480b1 | ||
|
|
f189ab4241 | ||
|
|
c6e1e33a23 | ||
|
|
a37ea528d3 | ||
|
|
eee06d8593 | ||
|
|
4dd3ee6e47 | ||
|
|
0336ce13ed | ||
|
|
193d4f80d6 | ||
|
|
e7d35ec1eb | ||
|
|
8540045658 | ||
|
|
1f1c1bd8af | ||
|
|
7fdc8bcb53 | ||
|
|
4b3c6aefe7 | ||
|
|
525cd71225 | ||
|
|
72abff9e5c | ||
|
|
bcaa477b3d | ||
|
|
66d4c99f82 | ||
|
|
0ac50f1383 | ||
|
|
c2257ad16e | ||
|
|
58f654405f | ||
|
|
35fb61a327 | ||
|
|
a6b6f3faaa | ||
|
|
5e829e2541 | ||
|
|
d4567da23c | ||
|
|
e5e0da25a0 | ||
|
|
821326bcfb | ||
|
|
4bf9ef239c | ||
|
|
e382891642 | ||
|
|
e5f4389464 | ||
|
|
d91f971f51 | ||
|
|
67e8295293 | ||
|
|
5fc2562642 | ||
|
|
2825bd0a73 | ||
|
|
9831f2a4a0 | ||
|
|
a11eef6b06 | ||
|
|
3fb731ade1 | ||
|
|
c76f1637ce | ||
|
|
ed117e8891 | ||
|
|
f9a3fafb3f | ||
|
|
660b8c7a0a | ||
|
|
0c22608dc7 | ||
|
|
2bb706feda | ||
|
|
5e6bc4ec50 | ||
|
|
57d0aaafc1 | ||
|
|
157e4d4265 | ||
|
|
54588e9c42 | ||
|
|
9e7274f3d7 | ||
|
|
ac4e335bdb | ||
|
|
1d255de48d | ||
|
|
9c1dcd37f9 | ||
|
|
f8dac183d0 | ||
|
|
45d1fa27de | ||
|
|
98b798b0e5 | ||
|
|
f18b64e7da | ||
|
|
460be9d581 | ||
|
|
97c8caea48 | ||
|
|
a34f93076a | ||
|
|
8f1c470061 | ||
|
|
dbf2a2f689 | ||
|
|
39a34a57ac | ||
|
|
f44b39705a | ||
|
|
f64ce217b7 | ||
|
|
cdf87f4b8f | ||
|
|
47fbc2a84d | ||
|
|
5cd3b7d7cc | ||
|
|
0121fa51c2 | ||
|
|
892941b609 | ||
|
|
e3022628b6 | ||
|
|
fdc33d0dba | ||
|
|
6d6411cc24 | ||
|
|
61a1ecffc5 | ||
|
|
d2dce37fa0 | ||
|
|
d65f0434da | ||
|
|
7499384110 | ||
|
|
7a0f68b7ec | ||
|
|
1a219fd2b6 | ||
|
|
6fb98dae12 | ||
|
|
8c2c0fa47a | ||
|
|
58c8365c33 | ||
|
|
2c11ec38fa | ||
|
|
fe5e23502d | ||
|
|
644cd1d2fb | ||
|
|
5ccfab6314 | ||
|
|
bf895ea5b1 | ||
|
|
e956e2562b | ||
|
|
defe874bf4 | ||
|
|
3f8935ee4d | ||
|
|
cd12500dbf | ||
|
|
5dc61d50ac | ||
|
|
11a82e110a | ||
|
|
16ebe8bf48 | ||
|
|
1bbe25647a | ||
|
|
e22b461563 | ||
|
|
c4a5715e18 | ||
|
|
5cb64faa72 | ||
|
|
0f78aa45fc | ||
|
|
179112a310 | ||
|
|
4ce9ed4eb3 | ||
|
|
11414cb68f | ||
|
|
bd53e729a0 | ||
|
|
ffd9289edc | ||
|
|
b1a7b9607f | ||
|
|
119e53d07c | ||
|
|
c3e2e12369 | ||
|
|
a70b361176 | ||
|
|
8b68f1a8af | ||
|
|
c72bf3174f | ||
|
|
472cef2382 | ||
|
|
b1d8475a03 | ||
|
|
3d3faf80bf | ||
|
|
bbb372284b | ||
|
|
8cf81e9bfc | ||
|
|
d90f06b389 | ||
|
|
c519832755 | ||
|
|
397a0b988e | ||
|
|
f1428fa0e0 | ||
|
|
7d2c546ee5 | ||
|
|
2332c30e26 | ||
|
|
b78bf3e642 | ||
|
|
1a09f9b9a3 | ||
|
|
5ae5ec7bcd | ||
|
|
c0ff6631aa | ||
|
|
ae60a4d0fd | ||
|
|
800cfd5be0 | ||
|
|
f296f9d21d | ||
|
|
8265ffc19e | ||
|
|
f8efe98608 | ||
|
|
2b5444f89e | ||
|
|
07d446fd19 | ||
|
|
a25426043b | ||
|
|
84692846b9 | ||
|
|
039b2c6719 | ||
|
|
70a3d9ba3a | ||
|
|
bd619bf4e9 | ||
|
|
072519f539 | ||
|
|
d9572ec450 | ||
|
|
ba250aabf2 | ||
|
|
0cc4f0c016 | ||
|
|
1a2e367a87 | ||
|
|
4f24843f89 | ||
|
|
bfb92a47b9 | ||
|
|
dc5d55004b | ||
|
|
d8e7f96d4d | ||
|
|
bb83d1d72f | ||
|
|
1480260e47 | ||
|
|
c8d688d39f | ||
|
|
9df4352089 | ||
|
|
dd25fd0526 | ||
|
|
c90fd54b6b | ||
|
|
9528df48cd | ||
|
|
924c35f883 | ||
|
|
588ec415ff | ||
|
|
bf229414ba | ||
|
|
afa819547d | ||
|
|
dbcdc159ef | ||
|
|
30f945897a | ||
|
|
eee5794ff9 | ||
|
|
966a6ebd8e | ||
|
|
4d3d0fe0d7 | ||
|
|
7b967ff82a | ||
|
|
90f9598ecc | ||
|
|
7b3c7deb28 | ||
|
|
040a11656c | ||
|
|
1459245258 | ||
|
|
dbe4c5ce55 | ||
|
|
80491ecc2c | ||
|
|
1a71b58101 | ||
|
|
0ce37a69d4 | ||
|
|
722bfd5f7c |
97
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
97
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
name: Bug report
|
||||
description: Are you experiencing a problem? Create a report to help us improve!
|
||||
labels: 'bug'
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Self Check
|
||||
- Try searching existing GitHub Issues (open or closed) for similar issues.
|
||||
- type: textarea
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Describe the bug
|
||||
description: A clear description of what the bug is.
|
||||
placeholder: e.g. I see an AssertionError when trying to scrape a Twitter user!
|
||||
- type: textarea
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: How to reproduce
|
||||
description: |
|
||||
How to reproduce the problem.
|
||||
This should be a minimal reproducible example, i.e. the shortest possible code or the smallest number of steps that still causes the error.
|
||||
placeholder: e.g. I can reproduce this issue by scraping the textfiles user with the twitter-user scraper.
|
||||
- type: textarea
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Expected behaviour
|
||||
description: A brief description of what should happen.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Screenshots and recordings
|
||||
description: |
|
||||
If applicable, add screenshots or videos to help explain your problem. (Videos should be as short as possible! Avoid watermarks too.)
|
||||
- type: input
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Operating system
|
||||
description: Include the version too, please!
|
||||
placeholder: e.g. Windows 10, Ubuntu 20.04, macOS 10.15...
|
||||
- type: input
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: |
|
||||
Python version: output of `python3 --version`
|
||||
- type: input
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: |
|
||||
snscrape version: output of `snscrape --version`
|
||||
- type: input
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Scraper
|
||||
placeholder: e.g. twitter-user, reddit-search, TwitterSearchScraper, ...
|
||||
- type: dropdown
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: How are you using snscrape?
|
||||
options: ['CLI (`snscrape ...` as a command, e.g. in a terminal)', 'Module (`import snscrape.modules.something` in Python code)']
|
||||
- type: textarea
|
||||
validations:
|
||||
required: false
|
||||
attributes:
|
||||
label: Backtrace
|
||||
description: What is the error snscrape gives you, if any?
|
||||
- type: textarea
|
||||
validations:
|
||||
required: false
|
||||
attributes:
|
||||
label: Log output
|
||||
description: |
|
||||
Insert here the debug log of snscrape.
|
||||
If you use the CLI, add the global options `-vv` to the command, e.g. `snscrape -vv twitter-search ...`.
|
||||
If you use the module, set the debug level in your Python code before any use of snscrape: `import logging; logging.basicConfig(level = logging.DEBUG)`.
|
||||
If you already use `logging` in your own code, you may need to adjust the level there instead.
|
||||
- type: textarea
|
||||
validations:
|
||||
required: false
|
||||
attributes:
|
||||
label: Dump of locals
|
||||
description: |
|
||||
Here attach the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path).
|
||||
Please note that it may contain identifying info such as IP address, if the website returns that.
|
||||
You can also optionally request to exchange the file in private.
|
||||
Finally, if snscrape didn't crash, leave this field blank.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Additional context
|
||||
description: Add any other context about the problem here.
|
||||
27
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
27
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
name: Feature Request
|
||||
description: Want a feature? Ask; we don't bite!
|
||||
labels: 'enhancement'
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Self Check
|
||||
- Try searching existing GitHub Issues (open or closed) for similar issues.
|
||||
- type: textarea
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Describe the feature
|
||||
description: A clear description of what the feature is.
|
||||
- type: textarea
|
||||
validations:
|
||||
required: false
|
||||
attributes:
|
||||
label: Would this fix a problem you're experiencing? If so, specify.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Did you consider other alternatives?
|
||||
description: If so, specify
|
||||
- type: input
|
||||
attributes:
|
||||
label: Additional context
|
||||
6
.github/ISSUE_TEMPLATE/question.md
vendored
Normal file
6
.github/ISSUE_TEMPLATE/question.md
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
name: Question
|
||||
about: Ask away! (Do not use this for bugs or features.)
|
||||
labels: 'question'
|
||||
|
||||
---
|
||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
__pycache__/
|
||||
/dist/
|
||||
/snscrape.egg-info/
|
||||
/.eggs/
|
||||
44
README.md
44
README.md
@@ -1,16 +1,19 @@
|
||||
# snscrape
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
|
||||
|
||||
The following services are currently supported:
|
||||
* Facebook: user profiles and groups
|
||||
* Gab: user profile posts, media, and comments
|
||||
* Google+: user profiles
|
||||
|
||||
* Facebook: user profiles, groups, and communities (aka visitor posts)
|
||||
* Instagram: user profiles, hashtags, and locations
|
||||
* Twitter: user profiles, hashtags, searches, threads, and lists (members as well as posts)
|
||||
* Mastodon: user profiles and toots (single or thread)
|
||||
* Reddit: users, subreddits, and searches (via Pushshift)
|
||||
* Telegram: channels
|
||||
* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends
|
||||
* VKontakte: user profiles
|
||||
* Weibo (Sina Weibo): user profiles
|
||||
|
||||
## Requirements
|
||||
snscrape requires Python 3.6 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.
|
||||
|
||||
Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
|
||||
|
||||
@@ -22,11 +25,28 @@ If you want to use the development version:
|
||||
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
|
||||
|
||||
## Usage
|
||||
To get all tweets by Jason Scott (@textfiles):
|
||||
### CLI
|
||||
The generic syntax of snscrape's CLI is:
|
||||
|
||||
snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
|
||||
|
||||
`snscrape --help` and `snscrape SCRAPER-NAME --help` provide details on the options and arguments. `snscrape --help` also lists all available scrapers.
|
||||
|
||||
The default output of the CLI is the URL of each result.
|
||||
|
||||
Some noteworthy global options are:
|
||||
|
||||
* `--jsonl` to get output as JSONL. This includes all information extracted by snscrape (e.g. message content, datetime, images; details vary by scraper).
|
||||
* `--max-results NUMBER` to only return the first `NUMBER` results.
|
||||
* `--with-entity` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with `--max-results 0` to only fetch the entity info.)
|
||||
|
||||
#### Examples
|
||||
Collect all tweets by Jason Scott (@textfiles):
|
||||
|
||||
snscrape twitter-user textfiles
|
||||
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `@textfiles-tweets`:
|
||||
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `twitter-@textfiles`:
|
||||
|
||||
```bash
|
||||
snscrape twitter-user textfiles >twitter-@textfiles
|
||||
```
|
||||
@@ -35,12 +55,14 @@ To get the latest 100 tweets with the hashtag #archiveteam:
|
||||
|
||||
snscrape --max-results 100 twitter-hashtag archiveteam
|
||||
|
||||
`snscrape --help` or `snscrape <module> --help` provides details on the available options. `snscrape --help` also lists all available modules.
|
||||
|
||||
### Library
|
||||
It is also possible to use snscrape as a library in Python, but this is currently undocumented.
|
||||
|
||||
## Issue reporting
|
||||
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
|
||||
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If you use the CLI, please run snscrape with `-vv` and include the log output in the issue. If you use snscrape as a module, please enable debug-level logging using `import logging; logging.basicConfig(level = logging.DEBUG)` (before using snscrape at all) and include the log output in the issue.
|
||||
|
||||
### Dump files
|
||||
In some cases, debugging may require more information than is available in the log. The CLI has a `--dump-locals` option that enables dumping all local variables within snscrape based on important log messages (rather than, by default, only on crashes). Note that the dump files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
|
||||
|
||||
## License
|
||||
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
||||
|
||||
37
pyproject.toml
Normal file
37
pyproject.toml
Normal file
@@ -0,0 +1,37 @@
|
||||
[build-system]
|
||||
requires = ['setuptools>=61', 'setuptools_scm>=6.2']
|
||||
build-backend = 'setuptools.build_meta'
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ['snscrape', 'snscrape.modules']
|
||||
|
||||
[tool.setuptools_scm]
|
||||
|
||||
[project]
|
||||
name = 'snscrape'
|
||||
description = 'A social networking service scraper'
|
||||
readme = 'README.md'
|
||||
authors = [{name = 'JustAnotherArchivist'}]
|
||||
classifiers = [
|
||||
'Development Status :: 4 - Beta',
|
||||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
'Programming Language :: Python :: 3.11',
|
||||
]
|
||||
dependencies = [
|
||||
'requests[socks]',
|
||||
'lxml',
|
||||
'beautifulsoup4',
|
||||
'pytz; python_version < "3.9.0"',
|
||||
'filelock',
|
||||
]
|
||||
requires-python = '~=3.8'
|
||||
dynamic = ['version']
|
||||
|
||||
[project.urls]
|
||||
repository = "https://github.com/JustAnotherArchivist/snscrape"
|
||||
|
||||
[project.scripts]
|
||||
snscrape = 'snscrape._cli:main'
|
||||
23
setup.py
23
setup.py
@@ -1,23 +0,0 @@
|
||||
import setuptools
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name = 'snscrape',
|
||||
description = 'A social networking service scraper',
|
||||
author = 'JustAnotherArchivist',
|
||||
url = 'https://github.com/JustAnotherArchivist/snscrape',
|
||||
classifiers = [
|
||||
'Development Status :: 4 - Beta',
|
||||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
],
|
||||
packages = ['snscrape', 'snscrape.modules'],
|
||||
setup_requires = ['setuptools_scm'],
|
||||
use_scm_version = True,
|
||||
install_requires = ['requests[socks]', 'lxml', 'beautifulsoup4'],
|
||||
entry_points = {
|
||||
'console_scripts': [
|
||||
'snscrape = snscrape.cli:main',
|
||||
],
|
||||
},
|
||||
)
|
||||
343
snscrape/_cli.py
Normal file
343
snscrape/_cli.py
Normal file
@@ -0,0 +1,343 @@
|
||||
import argparse
|
||||
import collections
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import datetime
|
||||
import importlib.metadata
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import requests
|
||||
# Imported in parse_args() after setting up the logger:
|
||||
#import snscrape.base
|
||||
#import snscrape.modules
|
||||
#import snscrape.version
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
|
||||
## Logging
# Module-wide switch read by Logger._log_with_stack: when true, messages of
# level WARNING or higher also dump the stack and locals to a file.
# configure_logging() overwrites it.
dumpLocals = False
# Placeholder so that `logger.<level>(...)` calls work before setup_logging() has run;
# until then they go to the module-level functions of the `logging` package.
logger = logging # Replaced below after setting the logger class
|
||||
|
||||
|
||||
class Logger(logging.Logger):
    """Logger that can dump the stack and local variables on serious messages.

    Every message of level WARNING or higher is logged normally. If the
    module-level ``dumpLocals`` flag is true, the caller's stack and locals are
    additionally written to a file via ``_dump_stack_and_locals`` and a second
    message naming that file is logged at the same level. A single call can opt
    out by passing ``extra = {'_snscrapeSuppressDumpLocals': True}``.
    """

    def _log_with_stack(self, level, *args, **kwargs):
        """Log the message, then dump stack and locals if dumping is enabled."""
        super().log(level, *args, **kwargs)
        # `extra = None` is what the logging API itself uses as the default, so
        # an explicit None must be treated like "no extra" instead of crashing.
        extra = kwargs.get('extra') or {}
        if dumpLocals and not extra.get('_snscrapeSuppressDumpLocals', False):
            stack = inspect.stack()
            # stack[0] is this method and stack[1] the public wrapper
            # (warning/error/critical/log); only the frames beyond those are dumped.
            if len(stack) >= 3:
                # inspect.stack() is innermost-first; reverse to outermost-first.
                name = _dump_stack_and_locals(stack[2:][::-1])
                super().log(level, f'Dumped stack and locals to {name}')

    def warning(self, *args, **kwargs):
        """Log at WARNING level, with an optional stack/locals dump."""
        self._log_with_stack(logging.WARNING, *args, **kwargs)

    def error(self, *args, **kwargs):
        """Log at ERROR level, with an optional stack/locals dump."""
        self._log_with_stack(logging.ERROR, *args, **kwargs)

    def critical(self, *args, **kwargs):
        """Log at CRITICAL level, with an optional stack/locals dump."""
        self._log_with_stack(logging.CRITICAL, *args, **kwargs)

    def log(self, level, *args, **kwargs):
        """Log at an arbitrary level; WARNING and above may trigger a dump."""
        if level >= logging.WARNING:
            self._log_with_stack(level, *args, **kwargs)
        else:
            super().log(level, *args, **kwargs)
|
||||
|
||||
|
||||
def _requests_request_repr(name, request):
|
||||
ret = []
|
||||
ret.append(f'{name} = {request!r}')
|
||||
ret.append(f'\n {name}.method = {request.method}')
|
||||
ret.append(f'\n {name}.url = {request.url}')
|
||||
ret.append(f'\n {name}.headers = \\')
|
||||
for field in request.headers:
|
||||
ret.append(f'\n {field} = {_repr("_", request.headers[field])}')
|
||||
for attr in ('body', 'params', 'data'):
|
||||
if hasattr(request, attr) and getattr(request, attr):
|
||||
ret.append(f'\n {name}.{attr} = ')
|
||||
ret.append(_repr('_', getattr(request, attr)).replace('\n', '\n '))
|
||||
return ''.join(ret)
|
||||
|
||||
|
||||
def _requests_response_repr(name, response, withHistory = True):
    """Render a response object as a multi-line ``name = ...`` text block.

    The block contains the repr, URL, originating request, optionally the
    redirect history, the status code, the headers and the content. Entries of
    ``response.history`` are rendered without their own history.
    """
    chunks = [
        f'{name} = {response!r}',
        f'\n {name}.url = {response.url}',
        f'\n {name}.request = ',
        _repr('_', response.request).replace('\n', '\n '),
    ]
    if withHistory and response.history:
        chunks.append(f'\n {name}.history = [')
        for earlier in response.history:
            nested = _requests_response_repr('_', earlier, withHistory = False)
            chunks += ['\n ', nested.replace('\n', '\n ')]
        chunks.append('\n ]')
    chunks.append(f'\n {name}.status_code = {response.status_code}')
    chunks.append(f'\n {name}.headers = \\')
    headers = response.headers
    chunks.extend(f'\n {field} = {_repr("_", headers[field])}' for field in headers)
    chunks.append(f'\n {name}.content = {_repr("_", response.content)}')
    return ''.join(chunks)
|
||||
|
||||
|
||||
def _requests_exception_repr(name, exc):
    """Render an exception carrying ``request`` and ``response`` attributes.

    The first line is the exception's repr; the request and response follow,
    each rendered through ``_repr`` and indented by one level.
    """
    header = f'{name} = {exc!r}'
    requestPart = _repr(f'{name}.request', exc.request).replace('\n', '\n ')
    responsePart = _repr(f'{name}.response', exc.response).replace('\n', '\n ')
    return f'{header}\n {requestPart}\n {responsePart}'
|
||||
|
||||
|
||||
def _repr(name, value):
    """Return a readable, possibly multi-line ``name = value`` dump of *value*.

    requests responses, requests and exceptions are delegated to the dedicated
    ``_requests_*_repr`` helpers. Dicts, dataclass instances and sequences that
    contain anything other than ints and strs are expanded to one entry per
    line, recursively. Everything else falls back to ``repr``; a repr containing
    newlines is moved to its own indented lines behind a backslash.
    """
    if type(value) is requests.Response:
        return _requests_response_repr(name, value)
    if type(value) in (requests.PreparedRequest, requests.Request):
        return _requests_request_repr(name, value)
    if isinstance(value, requests.exceptions.RequestException):
        return _requests_exception_repr(name, value)
    if isinstance(value, dict):
        return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
               '\n '.join(_repr(f'{name}[{k!r}]', v).replace('\n', '\n ') for k, v in value.items())
    if isinstance(value, (list, tuple, collections.deque)) and not all(isinstance(v, (int, str)) for v in value):
        return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
               '\n '.join(_repr(f'{name}[{i}]', v).replace('\n', '\n ') for i, v in enumerate(value))
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        # One `name.field = value` entry per field, in the same shape as the dict and
        # sequence branches. (Previously every entry carried a second, redundant
        # `name.field = 'field' = ` prefix.)
        return f'{name} = <{type(value).__module__}.{type(value).__name__}>\n ' + \
               '\n '.join(_repr(f'{name}.{f.name}', getattr(value, f.name)).replace('\n', '\n ') for f in dataclasses.fields(value))
    valueRepr = f'{name} = {value!r}'
    if '\n' in valueRepr:
        return ''.join(['\\\n ', valueRepr.replace('\n', '\n ')])
    return valueRepr
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _dump_locals_on_exception():
|
||||
try:
|
||||
yield
|
||||
except Exception as e:
|
||||
trace = inspect.trace()
|
||||
if len(trace) >= 2:
|
||||
name = _dump_stack_and_locals(trace[1:], exc = e)
|
||||
logger.fatal(f'Dumped stack and locals to {name}', extra = {'_snscrapeSuppressDumpLocals': True})
|
||||
raise
|
||||
|
||||
|
||||
def _dump_stack_and_locals(trace, exc = None):
|
||||
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
|
||||
if exc is not None:
|
||||
fp.write('Exception:\n')
|
||||
fp.write(f' {type(exc).__module__}.{type(exc).__name__}: {exc!s}\n')
|
||||
fp.write(f' args: {exc.args!r}\n')
|
||||
fp.write('\n')
|
||||
|
||||
fp.write('Stack:\n')
|
||||
for frameRecord in trace:
|
||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||
if frameRecord.code_context is not None:
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
fp.write('\n')
|
||||
|
||||
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
|
||||
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
|
||||
if module is None:
|
||||
# Module-less frame, e.g. dataclass.__init__
|
||||
for j in reversed(range(i)):
|
||||
if modules[j] is not None:
|
||||
break
|
||||
else:
|
||||
# No previous module scope
|
||||
continue
|
||||
module = modules[j]
|
||||
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
||||
continue
|
||||
locals_ = frameRecord[0].f_locals
|
||||
fp.write(f'Locals from file "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}:\n')
|
||||
for variableName in locals_:
|
||||
variable = locals_[variableName]
|
||||
varRepr = _repr(variableName, variable)
|
||||
fp.write(f' {variableName} {type(variable)} = ')
|
||||
fp.write(varRepr.replace('\n', '\n '))
|
||||
fp.write('\n')
|
||||
fp.write('\n')
|
||||
if 'self' in locals_ and hasattr(locals_['self'], '__dict__'):
|
||||
fp.write('Object dict:\n')
|
||||
fp.write(repr(locals_['self'].__dict__))
|
||||
fp.write('\n\n')
|
||||
name = fp.name
|
||||
return name
|
||||
|
||||
|
||||
def parse_datetime_arg(arg):
    """Parse a command-line date/time argument into an aware ``datetime``.

    Accepted forms are ``YYYY-MM-DD HH:MM:SS``, ``YYYY-MM-DD`` (each optionally
    followed by a ``%z`` UTC offset) and an integer unix timestamp. Values
    without an offset are interpreted as UTC.

    Raises:
        argparse.ArgumentTypeError: if *arg* matches none of the forms or is a
            timestamp outside the range supported by ``datetime``.
    """
    for fmt in ('%Y-%m-%d %H:%M:%S %z', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %z', '%Y-%m-%d'):
        try:
            d = datetime.datetime.strptime(arg, fmt)
        except ValueError:
            continue
        if d.tzinfo is None:
            return d.replace(tzinfo = datetime.timezone.utc)
        return d
    # Try treating it as a unix timestamp
    try:
        return datetime.datetime.fromtimestamp(int(arg), datetime.timezone.utc)
    except (ValueError, OverflowError, OSError):
        # ValueError: not an integer, or year out of range; OverflowError/OSError:
        # timestamp too large for the platform. All are reported as a bad argument below.
        pass
    raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object')
|
||||
|
||||
|
||||
def parse_format(arg):
    """Turn a user format string into one usable with ``str.format(item)``.

    Every single ``{`` is replaced by ``{0.`` so that ``{url}`` refers to the
    ``url`` attribute of the formatted item; escaped ``{{`` pairs are kept
    intact.

    Raises:
        argparse.ArgumentTypeError: if *arg* ends with a single, unescaped
            ``{`` (which previously leaked a bare ``StopIteration``).
    """
    parts = arg.split('{')
    last = len(parts) - 1
    out = []
    i = 0
    while i < last:
        out.append(parts[i])
        if parts[i + 1] == '':
            # An empty piece after a '{' means a second '{' followed immediately
            if i + 1 == last:
                # ... unless it is the end of the string: a lone trailing brace
                raise argparse.ArgumentTypeError('Single trailing { in format string: ' + repr(arg))
            out.append('{{')
            i += 2 # Skip the empty piece between the two braces
        else:
            out.append('{0.')
            i += 1
    out.append(parts[-1])
    return ''.join(out)
|
||||
|
||||
|
||||
class CitationAction(argparse.Action):
    """argparse action that prints citation information for snscrape and exits.

    The information is read from the metadata of the installed ``snscrape``
    distribution; if the package is not installed, an error is printed to
    stderr and the parser exits with status 1.
    """

    def __init__(self, option_strings, dest = argparse.SUPPRESS, *args, default = argparse.SUPPRESS, **kwargs):
        # Forward `default` instead of dropping it: with the base class default of None,
        # ArgumentDefaultsHelpFormatter appends '(default: None)' to the help text and an
        # unused attribute is added to the parsed namespace.
        super().__init__(option_strings, dest, *args, default = default, **kwargs)

    def __call__(self, parser, namespace, values, optionString):
        try:
            m = importlib.metadata.metadata('snscrape')
        except importlib.metadata.PackageNotFoundError:
            print('Error: could not find snscrape installation. --citation does not work without the package being installed.', file = sys.stderr)
            parser.exit(1)

        # Packages built from a pyproject.toml with only [project.urls] carry no Home-page
        # field; fall back to the first Project-URL entry ('label, url') in that case.
        url = m.get('home-page')
        if url is None:
            for entry in m.get_all('project-url') or ():
                candidate = entry.partition(',')[2].strip()
                if candidate:
                    url = candidate
                    break

        print(f'Author: {m["author"]}')
        print(f'Title: {m["name"]}: {m["summary"]}')
        print(f'URL: {url}')
        print(f'Version: {m["version"]}')
        # Takes the first four characters of the fourth dot-separated version component
        # as the end year -- NOTE(review): assumes versions always have such a component.
        print(f'Date: 2018‒{m["version"].split(".", 3)[3][:4]}')

        if '.dev' in m['version']:
            print()
            print('WARNING! You are running a development version. The date range may be incorrect. Please adjust the upper end of the range to the year of the commit.')

        parser.exit()
|
||||
|
||||
|
||||
def parse_args():
    """Build the command-line parser, parse the arguments and return the namespace.

    One subcommand is registered per named ``snscrape.base.Scraper`` subclass
    (direct or indirect); each subclass configures its own subparser and is
    stored as ``cls`` on the resulting namespace. ``--max-results 0`` is
    rejected unless ``--with-entity`` is given as well.
    """
    # Imported here rather than at module level so that they are loaded after setup_logging()
    import snscrape.base
    import snscrape.modules
    import snscrape.version

    helpFormatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class = helpFormatter)

    # Global options
    parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
    parser.add_argument('--citation', action = CitationAction, nargs = 0, help = 'Display recommended citation information and exit')
    parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
    parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
    parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
        help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
    parser.add_argument('-n', '--max-results', dest = 'maxResults', type = lambda x: int(x) if int(x) >= 0 else parser.error('--max-results N must be zero or positive'), metavar = 'N', help = 'Only return the first N results')
    outputGroup = parser.add_mutually_exclusive_group(required = False)
    outputGroup.add_argument('-f', '--format', dest = 'format', type = parse_format, default = None, help = 'Output format')
    outputGroup.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
    parser.add_argument('--with-entity', dest = 'withEntity', action = 'store_true', default = False, help = 'Include the entity (e.g. user, channel) as the first output item')
    parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
    parser.add_argument('--progress', action = 'store_true', default = False, help = 'Report progress on stderr')

    # Collect all named scraper classes, walking the subclass tree breadth-first
    subparsers = parser.add_subparsers(dest = 'scraper', metavar = 'SCRAPER', title = 'scrapers', required = True)
    pending = collections.deque(snscrape.base.Scraper.__subclasses__())
    scrapers = {}
    while pending:
        cls = pending.popleft()
        if cls.name is not None:
            scrapers[cls.name] = cls
        pending.extend(cls.__subclasses__())

    # One subcommand per scraper, in alphabetical order
    for scraperName in sorted(scrapers):
        cls = scrapers[scraperName]
        subparser = subparsers.add_parser(cls.name, help = '', formatter_class = helpFormatter)
        cls._cli_setup_parser(subparser)
        subparser.set_defaults(cls = cls)

    args = parser.parse_args()

    if args.maxResults == 0 and not args.withEntity:
        parser.error('--max-results 0 is only valid when used with --with-entity')

    return args
|
||||
|
||||
|
||||
def setup_logging():
    '''Register the custom Logger class and rebind the module-level logger.

    The logger class is registered before the module's logger is requested from the logging module.
    '''

    global logger
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def configure_logging(verbosity, dumpLocals_):
    '''Configure the root logger for command-line use.

    verbosity: 0 leaves the root logger's level untouched, 1 selects INFO, anything higher selects DEBUG.
    dumpLocals_: value stored in the module-level dumpLocals flag.

    Every handler previously attached to the root logger is removed and replaced by a single stream handler with the CLI log format.
    '''

    global dumpLocals
    dumpLocals = dumpLocals_

    rootLogger = logging.getLogger()

    # Set level
    if verbosity > 0:
        level = logging.INFO if verbosity == 1 else logging.DEBUG
        rootLogger.setLevel(level)
        for handler in rootLogger.handlers:
            handler.setLevel(level)

    # Create formatter
    formatter = logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')

    # Remove existing handlers
    # removeHandler mutates rootLogger.handlers, so iterate over a snapshot; iterating the live list skips every second handler.
    for handler in list(rootLogger.handlers):
        rootLogger.removeHandler(handler)

    # Add stream handler
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    rootLogger.addHandler(handler)
|
||||
|
||||
|
||||
def main():
    '''Run the command-line interface.

    Sets up logging, parses the arguments, constructs the selected scraper, and prints the entity (when requested) and the scraped items to stdout. Progress messages, when enabled, go to stderr.
    '''

    setup_logging()
    args = parse_args()
    configure_logging(args.verbosity, args.dumpLocals)
    scraper = args.cls._cli_from_args(args)

    i = 0 # Keeps the result count defined for the final messages when the scraper yields nothing
    with _dump_locals_on_exception():
        try:
            # The entity, if requested and available, is printed before any item
            if args.withEntity and (entity := scraper.entity):
                if args.jsonl:
                    print(entity.json())
                else:
                    print(entity)
            if args.maxResults == 0:
                logger.info('Exiting after 0 results')
                return
            for i, item in enumerate(scraper.get_items(), start = 1):
                if args.since is not None and item.date < args.since:
                    logger.info(f'Exiting due to reaching older results than {args.since}')
                    break
                if args.jsonl:
                    print(item.json())
                elif args.format is not None:
                    print(args.format.format(item))
                else:
                    print(item)
                if args.progress and i % 100 == 0:
                    print(f'Scraping, {i} results so far', file = sys.stderr)
                if args.maxResults and i >= args.maxResults:
                    logger.info(f'Exiting after {i} results')
                    if args.progress:
                        print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
                    break
            else:
                # Only reached when the loop ended without a break, i.e. the scraper was exhausted
                logger.info(f'Done, found {i} results')
                if args.progress:
                    print(f'Finished, {i} results', file = sys.stderr)
        except BrokenPipeError:
            # Redirect the stdout file descriptor to the null device before exiting with status 1
            # NOTE(review): presumably this keeps the flush at interpreter shutdown from failing again - confirm
            os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
            sys.exit(1)
|
||||
229
snscrape/base.py
229
snscrape/base.py
@@ -1,34 +1,156 @@
|
||||
__all__ = ['DeprecatedFeatureWarning', 'IntWithGranularity', 'Item', 'Scraper', 'ScraperException']
|
||||
|
||||
|
||||
import abc
|
||||
import copy
|
||||
import dataclasses
|
||||
import datetime
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import requests.adapters
|
||||
import urllib3.connection
|
||||
import time
|
||||
import warnings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Item:
|
||||
'''An abstract base class for an item returned by the scraper's get_items generator.
|
||||
def _module_deprecation_helper(all, **names):
|
||||
def __getattr__(name):
|
||||
if name in names:
|
||||
warnings.warn(f'{name} is deprecated, use {names[name].__name__} instead', DeprecatedFeatureWarning, stacklevel = 2)
|
||||
return names[name]
|
||||
raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
|
||||
def __dir__():
|
||||
return sorted(all + list(names.keys()))
|
||||
return __getattr__, __dir__
|
||||
|
||||
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
|
||||
|
||||
class DeprecatedFeatureWarning(FutureWarning):
    '''Warning category used when a deprecated name or property is accessed.'''
|
||||
|
||||
|
||||
class _DeprecatedProperty:
|
||||
def __init__(self, name, repl, replStr):
|
||||
self.name = name
|
||||
self.repl = repl
|
||||
self.replStr = replStr
|
||||
|
||||
def __get__(self, obj, objType):
|
||||
if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
|
||||
return self
|
||||
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedFeatureWarning, stacklevel = 2)
|
||||
return self.repl(obj)
|
||||
|
||||
|
||||
def _json_serialise_datetime(obj):
|
||||
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
|
||||
|
||||
if isinstance(obj, (datetime.datetime, datetime.date)):
|
||||
return obj.isoformat()
|
||||
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
|
||||
|
||||
|
||||
def _json_dataclass_to_dict(obj):
    '''Recursively convert dataclass objects into plain dicts.

    A dataclass becomes a dict holding a '_type' entry (module and class name), its public fields, and its public properties (regular and deprecated ones). Tuples, lists, dicts, and sets are converted element by element; anything else is deep-copied.
    '''

    if isinstance(obj, _JSONDataclass) or dataclasses.is_dataclass(obj):
        result = {'_type': f'{type(obj).__module__}.{type(obj).__name__}'}
        for field in dataclasses.fields(obj):
            assert field.name != '_type'
            if not field.name.startswith('_'):
                result[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
        # Add properties
        objType = type(obj)
        for attrName in dir(obj):
            if not isinstance(getattr(objType, attrName, None), (property, _DeprecatedProperty)):
                continue
            assert attrName != '_type'
            if attrName.startswith('_'):
                continue
            result[attrName] = _json_dataclass_to_dict(getattr(obj, attrName))
        return result
    if isinstance(obj, (tuple, list)):
        return type(obj)(_json_dataclass_to_dict(element) for element in obj)
    if isinstance(obj, dict):
        return {_json_dataclass_to_dict(key): _json_dataclass_to_dict(value) for key, value in obj.items()}
    if isinstance(obj, set):
        return {_json_dataclass_to_dict(element) for element in obj}
    return copy.deepcopy(obj)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class _JSONDataclass:
    '''A base class for dataclasses for conversion to JSON'''

    def json(self):
        '''Convert the object to a JSON string

        Top-level IntWithGranularity values are emitted as plain ints, with their granularity stored under an additional '<key>.granularity' entry.
        '''

        # Deprecated properties are read during the conversion; their warnings are silenced here
        with warnings.catch_warnings():
            warnings.filterwarnings(action = 'ignore', category = DeprecatedFeatureWarning)
            out = _json_dataclass_to_dict(self)
        for key, value in list(out.items()): # Modifying the dict below, so make a copy first
            if isinstance(value, IntWithGranularity):
                out[key] = int(value)
                assert f'{key}.granularity' not in out, f'Granularity collision on {key}.granularity'
                out[f'{key}.granularity'] = value.granularity
        # datetime/date values are handled by the default hook
        return json.dumps(out, default = _json_serialise_datetime)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Item(_JSONDataclass):
    '''Abstract base for the items a scraper returns.

    There is no restriction on what an item holds. Its string representation should be useful for the CLI output (e.g. a direct URL for the item).
    '''

    @abc.abstractmethod
    def __str__(self):
        '''Return the string representation of the item.'''
|
||||
|
||||
|
||||
class URLItem(Item):
|
||||
'''A generic item which only holds a URL string.'''
|
||||
class IntWithGranularity(int):
|
||||
'''A number with an associated granularity
|
||||
|
||||
def __init__(self, url):
|
||||
self._url = url
|
||||
For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.
|
||||
'''
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return self._url
|
||||
def __new__(cls, value, granularity, *args, **kwargs):
|
||||
obj = super().__new__(cls, value, *args, **kwargs)
|
||||
obj.granularity = granularity
|
||||
return obj
|
||||
|
||||
def __str__(self):
|
||||
return self._url
|
||||
def __reduce__(self):
|
||||
return (IntWithGranularity, (int(self), self.granularity))
|
||||
|
||||
|
||||
class _HTTPSAdapter(requests.adapters.HTTPAdapter):
    '''HTTP adapter that tries to make HTTPS connection pools use _HTTPSConnection.'''

    def init_poolmanager(self, *args, **kwargs):
        '''Initialise the pool manager as usual, then install _HTTPSConnection as the HTTPS connection class.'''

        super().init_poolmanager(*args, **kwargs)
        #FIXME: Uses private urllib3.PoolManager attribute pool_classes_by_scheme.
        try:
            self.poolmanager.pool_classes_by_scheme['https'].ConnectionCls = _HTTPSConnection
        except (AttributeError, KeyError) as e:
            # Best effort: a failure here is only logged at debug level and otherwise ignored
            _logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}')
|
||||
|
||||
|
||||
class _HTTPSConnection(urllib3.connection.HTTPSConnection):
    '''HTTPS connection that logs the peer address and the cipher at debug level after connecting.'''

    def connect(self, *args, **kwargs):
        '''Connect via the parent class, log peer and cipher details, and return the parent's result.'''

        conn = super().connect(*args, **kwargs)
        #FIXME: Uses undocumented attribute self.sock and beyond.
        try:
            _logger.debug(f'Connected to: {self.sock.getpeername()}')
        except AttributeError:
            # self.sock might be a urllib3.util.ssltransport.SSLTransport, which lacks getpeername.
            pass
        try:
            _logger.debug(f'Connection cipher: {self.sock.cipher()}')
        except AttributeError:
            # Shouldn't be possible, but better safe than sorry.
            pass
        return conn
|
||||
|
||||
|
||||
class ScraperException(Exception):
|
||||
@@ -40,58 +162,88 @@ class Scraper:
|
||||
|
||||
name = None
|
||||
|
||||
def __init__(self, retries = 3):
|
||||
def __init__(self, *, retries = 3, proxies = None):
|
||||
self._retries = retries
|
||||
self._proxies = proxies
|
||||
self._session = requests.Session()
|
||||
self._session.mount('https://', _HTTPSAdapter())
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_items(self):
|
||||
'''Iterator yielding Items.'''
|
||||
|
||||
pass
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
|
||||
def _get_entity(self):
|
||||
'''Get the entity behind the scraper, if any.
|
||||
|
||||
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
|
||||
'''
|
||||
|
||||
return None
|
||||
|
||||
@functools.cached_property
|
||||
def entity(self):
|
||||
return self._get_entity()
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
||||
proxies = proxies or self._proxies or {}
|
||||
errors = []
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
||||
logger.info(f'Retrieving {req.url}')
|
||||
logger.debug(f'... with headers: {headers!r}')
|
||||
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
|
||||
_logger.info(f'Retrieving {req.url}')
|
||||
_logger.debug(f'... with headers: {headers!r}')
|
||||
if data:
|
||||
logger.debug(f'... with data: {data!r}')
|
||||
_logger.debug(f'... with data: {data!r}')
|
||||
if environmentSettings:
|
||||
_logger.debug(f'... with environmentSettings: {environmentSettings!r}')
|
||||
try:
|
||||
r = self._session.send(req, timeout = timeout)
|
||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
|
||||
except requests.exceptions.RequestException as exc:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.WARNING
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
||||
_logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
||||
errors.append(repr(exc))
|
||||
else:
|
||||
redirected = f' (redirected to {r.url})' if r.history else ''
|
||||
_logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
|
||||
_logger.debug(f'... with response headers: {r.headers!r}')
|
||||
if r.history:
|
||||
for i, redirect in enumerate(r.history):
|
||||
_logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})')
|
||||
_logger.debug(f'... ... with response headers: {redirect.headers!r}')
|
||||
if responseOkCallback is not None:
|
||||
success, msg = responseOkCallback(r)
|
||||
errors.append(msg)
|
||||
else:
|
||||
success, msg = (True, None)
|
||||
msg = f': {msg}' if msg else ''
|
||||
|
||||
if success:
|
||||
logger.debug(f'{req.url} retrieved successfully{msg}')
|
||||
_logger.debug(f'{req.url} retrieved successfully{msg}')
|
||||
return r
|
||||
else:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
level = logging.WARNING
|
||||
level = logging.INFO
|
||||
else:
|
||||
retrying = ''
|
||||
level = logging.ERROR
|
||||
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
|
||||
_logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
|
||||
if attempt < self._retries:
|
||||
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
|
||||
logger.info(f'Waiting {sleepTime:.0f} seconds')
|
||||
_logger.info(f'Waiting {sleepTime:.0f} seconds')
|
||||
time.sleep(sleepTime)
|
||||
else:
|
||||
msg = f'{self._retries + 1} requests to {req.url} failed, giving up.'
|
||||
logger.fatal(msg)
|
||||
_logger.fatal(msg)
|
||||
_logger.fatal(f'Errors: {", ".join(errors)}')
|
||||
raise ScraperException(msg)
|
||||
raise RuntimeError('Reached unreachable code')
|
||||
|
||||
@@ -102,11 +254,26 @@ class Scraper:
|
||||
return self._request('POST', *args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def setup_parser(cls, subparser):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def from_args(cls, args):
|
||||
pass
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args)
|
||||
|
||||
@classmethod
|
||||
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
||||
return cls(*args, **kwargs, retries = argparseArgs.retries)
|
||||
|
||||
|
||||
def nonempty_string(name):
    '''Create a converter function, named name, that strips its argument and rejects empty results.

    The returned function returns the stripped string, or raises ValueError if nothing is left after stripping.
    '''

    def f(s):
        stripped = s.strip()
        if not stripped:
            raise ValueError('must not be an empty string')
        return stripped

    f.__name__ = name
    return f
|
||||
|
||||
|
||||
__getattr__, __dir__ = _module_deprecation_helper(__all__, Entity = Item)
|
||||
|
||||
236
snscrape/cli.py
236
snscrape/cli.py
@@ -1,236 +0,0 @@
|
||||
import argparse
|
||||
import contextlib
|
||||
import datetime
|
||||
import inspect
|
||||
import logging
|
||||
import requests.models
|
||||
# Imported in parse_args() after setting up the logger:
|
||||
#import snscrape.base
|
||||
#import snscrape.modules
|
||||
#import snscrape.version
|
||||
import tempfile
|
||||
|
||||
|
||||
## Logging
|
||||
dumpLocals = False
|
||||
logger = logging # Replaced below after setting the logger class
|
||||
|
||||
|
||||
class Logger(logging.Logger):
    '''Logger that can dump the call stack and local variables for messages of level WARNING or higher.

    The dump is only written when the module-level dumpLocals flag is set.
    '''

    def _log_with_stack(self, level, *args, **kwargs):
        '''Log the message, then dump the callers' stack and locals if dumping is enabled.'''

        super().log(level, *args, **kwargs)
        if not dumpLocals:
            return
        stack = inspect.stack()
        if len(stack) < 3:
            return
        # The first two frames are this method and the logging method that called it; the rest is passed on outermost first.
        name = _dump_stack_and_locals(stack[2:][::-1])
        super().log(level, f'Dumped stack and locals to {name}')

    def warning(self, *args, **kwargs):
        self._log_with_stack(logging.WARNING, *args, **kwargs)

    def error(self, *args, **kwargs):
        self._log_with_stack(logging.ERROR, *args, **kwargs)

    def critical(self, *args, **kwargs):
        self._log_with_stack(logging.CRITICAL, *args, **kwargs)

    def log(self, level, *args, **kwargs):
        '''Log at an arbitrary level; WARNING and above go through the dumping path.'''

        emit = self._log_with_stack if level >= logging.WARNING else super().log
        emit(level, *args, **kwargs)
|
||||
|
||||
|
||||
def _requests_preparedrequest_repr(name, request):
|
||||
ret = []
|
||||
ret.append(repr(request))
|
||||
ret.append(f'\n {name}.method = {request.method}')
|
||||
ret.append(f'\n {name}.url = {request.url}')
|
||||
ret.append(f'\n {name}.headers = \\')
|
||||
for field in request.headers:
|
||||
ret.append(f'\n {field} = {_repr("_", request.headers[field])}')
|
||||
if request.body:
|
||||
ret.append(f'\n {name}.body = ')
|
||||
ret.append(_repr('_', request.body).replace('\n', '\n '))
|
||||
return ''.join(ret)
|
||||
|
||||
|
||||
def _requests_response_repr(name, response, withHistory = True):
    '''Build a multi-line description of a response: its repr, URL, request, optional redirect history, status code, headers, and content.

    name is the label prefixed to the attribute lines. Responses in the history are rendered without their own history.
    '''

    pieces = [
        repr(response),
        f'\n {name}.url = {response.url}',
        f'\n {name}.request = ',
        _repr('_', response.request).replace('\n', '\n '),
    ]
    if withHistory and response.history:
        pieces.append(f'\n {name}.history = [')
        for earlierResponse in response.history:
            pieces.append(f'\n ')
            pieces.append(_requests_response_repr('_', earlierResponse, withHistory = False).replace('\n', '\n '))
        pieces.append('\n ]')
    pieces.append(f'\n {name}.status_code = {response.status_code}')
    pieces.append(f'\n {name}.headers = \\')
    pieces.extend(f'\n {field} = {_repr("_", response.headers[field])}' for field in response.headers)
    pieces.append(f'\n {name}.content = {_repr("_", response.content)}')
    return ''.join(pieces)
|
||||
|
||||
|
||||
def _repr(name, value):
    '''Return a representation of value for the locals dump.

    Exact requests Response and PreparedRequest objects get the detailed multi-line form, labelled with name. For anything else the plain repr is used; a multi-line repr is moved onto a continuation line with its inner lines prefixed.
    '''

    valueType = type(value)
    if valueType is requests.models.Response:
        return _requests_response_repr(name, value)
    if valueType is requests.models.PreparedRequest:
        return _requests_preparedrequest_repr(name, value)
    text = repr(value)
    if '\n' not in text:
        return text
    return '\\\n ' + text.replace('\n', '\n ')
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _dump_locals_on_exception():
|
||||
try:
|
||||
yield
|
||||
except Exception as e:
|
||||
trace = inspect.trace()
|
||||
if len(trace) >= 2:
|
||||
name = _dump_stack_and_locals(trace[1:])
|
||||
logger.fatal(f'Dumped stack and locals to {name}')
|
||||
raise
|
||||
|
||||
|
||||
def _dump_stack_and_locals(trace):
|
||||
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
|
||||
fp.write('Stack:\n')
|
||||
for frameRecord in trace:
|
||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
fp.write('\n')
|
||||
|
||||
for frameRecord in trace:
|
||||
module = inspect.getmodule(frameRecord[0])
|
||||
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
||||
continue
|
||||
locals_ = frameRecord[0].f_locals
|
||||
fp.write(f'Locals from file "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}:\n')
|
||||
for variableName in locals_:
|
||||
variable = locals_[variableName]
|
||||
varRepr = _repr(variableName, variable)
|
||||
fp.write(f' {variableName} {type(variable)} = ')
|
||||
fp.write(varRepr.replace('\n', '\n '))
|
||||
fp.write('\n')
|
||||
fp.write('\n')
|
||||
if 'self' in locals_ and hasattr(locals_['self'], '__dict__'):
|
||||
fp.write(f'Object dict:\n')
|
||||
fp.write(repr(locals_['self'].__dict__))
|
||||
fp.write('\n\n')
|
||||
name = fp.name
|
||||
return name
|
||||
|
||||
|
||||
def parse_datetime_arg(arg):
    '''Parse a command-line argument into a timezone-aware datetime.

    Accepted forms are 'YYYY-MM-DD HH:MM:SS', 'YYYY-MM-DD', either of them followed by a UTC offset, and an integer unix timestamp. Values without an offset are interpreted as UTC.

    Raises argparse.ArgumentTypeError if the argument matches none of these forms or is a timestamp outside the supported range.
    '''

    for fmt in ('%Y-%m-%d %H:%M:%S %z', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %z', '%Y-%m-%d'):
        try:
            d = datetime.datetime.strptime(arg, fmt)
        except ValueError:
            continue
        if d.tzinfo is None:
            return d.replace(tzinfo = datetime.timezone.utc)
        return d
    # Try treating it as a unix timestamp
    try:
        return datetime.datetime.fromtimestamp(int(arg), datetime.timezone.utc)
    except (ValueError, OverflowError, OSError):
        # int() raises ValueError on non-numeric input; fromtimestamp raises ValueError, OverflowError, or OSError for out-of-range timestamps.
        pass
    raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object')
|
||||
|
||||
|
||||
def parse_args():
    '''Build the argument parser, register one subcommand per named scraper class, and parse the command line.

    Returns the parsed argparse namespace; its cls attribute is the selected scraper class. Raises RuntimeError if no scraper was specified.
    '''

    # Imported here rather than at module level so that the logger class is set up first
    import snscrape.base
    import snscrape.modules
    import snscrape.version

    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
    parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
    parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
    parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
        help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
    parser.add_argument('-n', '--max-results', dest = 'maxResults', type = int, metavar = 'N', help = 'Only return the first N results')
    parser.add_argument('-f', '--format', dest = 'format', type = str, default = None, help = 'Output format')
    parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')

    subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
    classes = snscrape.base.Scraper.__subclasses__()
    # The list is extended while it is being iterated, so subclasses at every depth are visited
    for cls in classes:
        # Classes without a name get no subcommand, but their subclasses are still visited
        if cls.name is not None:
            subparser = subparsers.add_parser(cls.name, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
            cls.setup_parser(subparser)
            subparser.set_defaults(cls = cls)
        classes.extend(cls.__subclasses__())

    args = parser.parse_args()

    # http://bugs.python.org/issue16308 / https://bugs.python.org/issue26510 (fixed in Python 3.7)
    if not args.scraper:
        raise RuntimeError('Error: no scraper specified')

    return args
|
||||
|
||||
|
||||
def setup_logging():
    '''Register the custom Logger class and rebind the module-level logger.

    The logger class is registered before the module's logger is requested from the logging module.
    '''

    global logger
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def configure_logging(verbosity, dumpLocals_):
    '''Configure the root logger for command-line use.

    verbosity: 0 leaves the root logger's level untouched, 1 selects INFO, anything higher selects DEBUG.
    dumpLocals_: value stored in the module-level dumpLocals flag.

    Every handler previously attached to the root logger is removed and replaced by a single stream handler with the CLI log format.
    '''

    global dumpLocals
    dumpLocals = dumpLocals_

    rootLogger = logging.getLogger()

    # Set level
    if verbosity > 0:
        level = logging.INFO if verbosity == 1 else logging.DEBUG
        rootLogger.setLevel(level)
        for handler in rootLogger.handlers:
            handler.setLevel(level)

    # Create formatter
    formatter = logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')

    # Remove existing handlers
    # removeHandler mutates rootLogger.handlers, so iterate over a snapshot; iterating the live list skips every second handler.
    for handler in list(rootLogger.handlers):
        rootLogger.removeHandler(handler)

    # Add stream handler
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    rootLogger.addHandler(handler)
|
||||
|
||||
|
||||
def main():
    '''Run the command-line interface.

    Sets up logging, parses the arguments, constructs the selected scraper, and prints each scraped item to stdout until the scraper is exhausted or one of the --since / --max-results limits is hit.
    '''

    setup_logging()
    args = parse_args()
    configure_logging(args.verbosity, args.dumpLocals)
    scraper = args.cls.from_args(args)

    i = 0 # Keeps the result count defined for the final message when the scraper yields nothing
    with _dump_locals_on_exception():
        for i, item in enumerate(scraper.get_items(), start = 1):
            if args.since is not None and item.date < args.since:
                logger.info(f'Exiting due to reaching older results than {args.since}')
                break
            if args.format is not None:
                # The item's fields are available to the format string by name
                print(args.format.format(**item._asdict()))
            else:
                print(item)
            if args.maxResults and i >= args.maxResults:
                logger.info(f'Exiting after {i} results')
                break
        else:
            # Only reached when the loop ended without a break, i.e. the scraper was exhausted
            logger.info(f'Done, found {i} results')
|
||||
@@ -1,15 +1,17 @@
|
||||
import importlib
|
||||
import os
|
||||
import snscrape.base
|
||||
import pkgutil
|
||||
|
||||
|
||||
__all__ = []
|
||||
|
||||
|
||||
def _import_modules():
|
||||
files = os.listdir(__path__[0])
|
||||
for fn in files:
|
||||
if fn.endswith('.py') and fn != '__init__.py':
|
||||
# Import module if not already imported
|
||||
moduleName = f'snscrape.modules.{fn[:-3]}'
|
||||
module = importlib.import_module(moduleName)
|
||||
prefixLen = len(__name__) + 1
|
||||
for importer, moduleName, isPkg in pkgutil.iter_modules(__path__, prefix = f'{__name__}.'):
|
||||
assert not isPkg
|
||||
moduleNameWithoutPrefix = moduleName[prefixLen:]
|
||||
__all__.append(moduleNameWithoutPrefix)
|
||||
module = importer.find_module(moduleName).load_module(moduleName)
|
||||
globals()[moduleNameWithoutPrefix] = module
|
||||
|
||||
|
||||
_import_modules()
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
__all__ = ['FacebookPost', 'User', 'FacebookUserScraper', 'FacebookCommunityScraper', 'FacebookGroupScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
@@ -8,22 +12,44 @@ import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FacebookPost(typing.NamedTuple, snscrape.base.Item):
|
||||
@dataclasses.dataclass
|
||||
class FacebookPost(snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
date: datetime.datetime
|
||||
content: typing.Optional[str]
|
||||
outlinks: list
|
||||
outlinksss: str
|
||||
|
||||
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
||||
|
||||
def __str__(self):
|
||||
return self.cleanUrl
|
||||
|
||||
|
||||
class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
@dataclasses.dataclass
class User(snscrape.base.Item):
    '''Item holding the data of a Facebook user; its string form is the URL built from the username.'''

    username: str
    pageId: int
    name: str
    verified: bool
    # The remaining fields are optional and default to None
    created: typing.Optional[datetime.date] = None
    pageOwner: typing.Optional[str] = None
    likes: typing.Optional[int] = None
    followers: typing.Optional[int] = None
    checkins: typing.Optional[int] = None
    address: typing.Optional[str] = None
    phone: typing.Optional[str] = None
    web: typing.Optional[str] = None
    keywords: typing.Optional[typing.List[str]] = None

    def __str__(self):
        '''Return the facebook.com URL for this username.'''

        return f'https://www.facebook.com/{self.username}/'
|
||||
|
||||
|
||||
class _FacebookCommonScraper(snscrape.base.Scraper):
|
||||
def _clean_url(self, dirtyUrl):
|
||||
u = urllib.parse.urlparse(dirtyUrl)
|
||||
if u.path == '/permalink.php':
|
||||
@@ -41,7 +67,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
if setVal.rstrip('0123456789').endswith('.a.'):
|
||||
setVal = f'a.{setVal.rsplit(".", 1)[1]}'
|
||||
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
|
||||
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/'):
|
||||
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
|
||||
# No manipulation of the path needed, but strip the query string
|
||||
clean = (u.scheme, u.netloc, u.path, '', '')
|
||||
elif u.path.split('/')[2] in ('photos', 'videos'):
|
||||
@@ -82,10 +108,21 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
def _soup_to_items(self, soup, baseUrl, mode):
|
||||
cleanUrl = None # Value from previous iteration is used for warning on link-less entries
|
||||
for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
|
||||
# Check that this is not inside another div._5pcr to avoid duplicates or extracting the wrong URL (e.g. 'X was mentioned in a post' on community pages)
|
||||
parent = entry.parent
|
||||
isNested = False
|
||||
while parent:
|
||||
if parent.name == 'div' and 'class' in parent.attrs and '_5pcr' in parent.attrs['class']:
|
||||
isNested = True
|
||||
break
|
||||
parent = parent.parent
|
||||
if isNested:
|
||||
continue
|
||||
|
||||
entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
|
||||
mediaSetA = entry.find('a', class_ = '_17z-')
|
||||
if not mediaSetA and not entryA:
|
||||
logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
|
||||
_logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
|
||||
continue
|
||||
if mediaSetA and (not entryA or entryA['href'] == '#'):
|
||||
href = mediaSetA['href']
|
||||
@@ -94,13 +131,12 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
oddLink, warn = self._is_odd_link(href, entry.text, mode)
|
||||
if oddLink:
|
||||
if warn:
|
||||
logger.warning(f'Ignoring odd link: {href}')
|
||||
_logger.warning(f'Ignoring odd link: {href}')
|
||||
continue
|
||||
dirtyUrl = urllib.parse.urljoin(baseUrl, href)
|
||||
cleanUrl = self._clean_url(dirtyUrl)
|
||||
date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
|
||||
contentDiv = entry.find('div', class_ = '_5pbx')
|
||||
if contentDiv:
|
||||
if (contentDiv := entry.find('div', class_ = '_5pbx')):
|
||||
content = contentDiv.text
|
||||
else:
|
||||
content = None
|
||||
@@ -113,71 +149,157 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
continue
|
||||
query = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
|
||||
if 'u' not in query or len(query['u']) != 1:
|
||||
logger.warning(f'Ignoring odd outlink: {href}')
|
||||
_logger.warning(f'Ignoring odd outlink: {href}')
|
||||
continue
|
||||
outlink = query['u'][0]
|
||||
if outlink.startswith('http://') or outlink.startswith('https://') and outlink not in outlinks:
|
||||
outlinks.append(outlink)
|
||||
yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks, outlinksss = ' '.join(outlinks))
|
||||
yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks)
|
||||
|
||||
|
||||
class FacebookUserScraper(FacebookCommonScraper):
|
||||
name = 'facebook-user'
|
||||
|
||||
class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._username = username
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
self._initialPage = None
|
||||
self._initialPageSoup = None
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = self._headers)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
self._initialPage = r
|
||||
self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
|
||||
nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
_logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error('Got status code {r.status_code}')
|
||||
return
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
while nextPageLink:
|
||||
logger.info('Retrieving next page')
|
||||
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
|
||||
_logger.info('Retrieving next page')
|
||||
|
||||
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
|
||||
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
|
||||
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
|
||||
r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
response = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||
assert 'domops' in response
|
||||
assert len(response['domops']) == 1
|
||||
assert len(response['domops'][0]) == 4
|
||||
assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
|
||||
assert response['domops'][0][1] == '#www_pages_reaction_see_more_unitwww_pages_home'
|
||||
assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
|
||||
assert response['domops'][0][2] == False
|
||||
assert '__html' in response['domops'][0][3]
|
||||
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Facebook username or user ID')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.username, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class FacebookGroupScraper(FacebookCommonScraper):
|
||||
class FacebookUserScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-user'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/'
|
||||
|
||||
def _get_entity(self):
|
||||
kwargs = {}
|
||||
|
||||
nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
|
||||
handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
|
||||
handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
|
||||
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
|
||||
createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')
|
||||
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
handleDiv = handleDivPattern.search(r.text)
|
||||
handle = handlePattern.search(handleDiv.group(0))
|
||||
kwargs['username'] = handle.group(1)
|
||||
|
||||
nameVerifiedMarkup = nameVerifiedMarkupPattern.search(r.text)
|
||||
nameVerifiedMarkup = json.loads(nameVerifiedMarkup.group(1))
|
||||
nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedMarkup, 'lxml')
|
||||
kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
|
||||
kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))
|
||||
|
||||
pageTransparencyContentDiv = soup.find('div', class_ = '_61-0')
|
||||
if pageTransparencyContentDiv.text.startswith('Page created - '):
|
||||
createdDateMess = pageTransparencyContentDiv.text.split(' - ', 1)[1]
|
||||
m = createdDatePattern.match(createdDateMess)
|
||||
assert m, 'unexpected created div content'
|
||||
kwargs['created'] = datetime.date(int(m.group(3)), months.index(m.group(1)) + 1, int(m.group(2)))
|
||||
if pageTransparencyContentDiv.text.startswith('Confirmed Page Owner: '):
|
||||
kwargs['pageOwner'] = pageTransparencyContentDiv.text.split(': ', 1)[1]
|
||||
|
||||
communityDiv = soup.find('div', class_ = '_6590')
|
||||
for div in communityDiv.find_all('div', class_ = '_4bl9'):
|
||||
text = div.text
|
||||
if text.endswith(' people like this'):
|
||||
kwargs['likes'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' people follow this'):
|
||||
kwargs['followers'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
elif text.endswith(' check-ins'):
|
||||
kwargs['checkins'] = int(text.split(' ', 1)[0].replace(',', ''))
|
||||
|
||||
aboutDiv = soup.find('div', class_ = '_u9q')
|
||||
if aboutDiv:
|
||||
# As if the above wasn't already ugly enough, this is where it gets really bad...
|
||||
for div in aboutDiv.find_all('div', class_ = '_2pi9'):
|
||||
img = div.find('img', class_ = '_3-91')
|
||||
if not img:
|
||||
continue
|
||||
if img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png': # Address
|
||||
rawAddress = div.find('div', class_ = '_2wzd').text
|
||||
kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png': # Phone number
|
||||
kwargs['phone'] = div.find('div', class_ = '_4bl9').text
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png': # Web link
|
||||
for a in div.find_all('a'):
|
||||
if a.text == '' or 'href' not in a.attrs or a.find('span'):
|
||||
continue
|
||||
dirtyWeb = a['href']
|
||||
assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
|
||||
kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
|
||||
elif img['src'] == 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png': # Keywords
|
||||
kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')
|
||||
|
||||
androidUrlMeta = soup.find('meta', property = 'al:android:url')
|
||||
assert androidUrlMeta['content'].startswith('fb://page/') and androidUrlMeta['content'].endswith('?referrer=app_link')
|
||||
kwargs['pageId'] = int(androidUrlMeta['content'][10:-18])
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
|
||||
class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
|
||||
name = 'facebook-community'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._baseUrl = f'https://www.facebook.com/{self._username}/community/'
|
||||
|
||||
|
||||
class FacebookGroupScraper(_FacebookCommonScraper):
|
||||
name = 'facebook-group'
|
||||
|
||||
def __init__(self, group, **kwargs):
|
||||
@@ -191,18 +313,16 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
|
||||
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
|
||||
|
||||
baseUrl = f'https://www.facebook.com/groups/{self._group}/'
|
||||
baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
|
||||
r = self._get(baseUrl, headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('Group does not exist')
|
||||
_logger.warning('Group does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error('Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
|
||||
logger.error('Code container ID marker not found (does the group exist?)')
|
||||
return
|
||||
raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')
|
||||
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
@@ -212,35 +332,33 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
|
||||
codeContainer = soup.find('code', id = codeContainerId)
|
||||
if not codeContainer:
|
||||
raise RuntimeError('Code container not found')
|
||||
raise snscrape.base.ScraperException('Code container not found')
|
||||
if type(codeContainer.string) is not bs4.element.Comment:
|
||||
raise RuntimeError('Code container does not contain a comment')
|
||||
raise snscrape.base.ScraperException('Code container does not contain a comment')
|
||||
codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
|
||||
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
|
||||
|
||||
# Pagination
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
while True:
|
||||
while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
|
||||
# As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
|
||||
r = self._get(
|
||||
f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
|
||||
'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
|
||||
params = {'data': data, '__a': 1},
|
||||
headers = headers,
|
||||
)
|
||||
if r.status_code != 200:
|
||||
raise RuntimeError(f'Got status code {r.status_code}')
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
obj = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||
if obj['payload'] == '':
|
||||
# End of pagination
|
||||
break
|
||||
soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'group')
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('group', help = 'A group name or ID')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('group', type = snscrape.base.nonempty_string('group'), help = 'A group name or ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.group, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.group)
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import snscrape.base
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GabPost(typing.NamedTuple, snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
|
||||
class GabUserCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, mode, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if mode not in ('posts', 'comments', 'media'):
|
||||
raise ValueError('Invalid mode')
|
||||
self._mode = mode
|
||||
self._username = username
|
||||
if mode == 'posts':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}'
|
||||
self._beforeGlue = '?'
|
||||
elif mode == 'comments':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}/comments?includes=post.conversation_parent'
|
||||
self._beforeGlue = '&'
|
||||
elif mode == 'media':
|
||||
self._baseUrl = f'https://gab.com/api/feed/{username}/media'
|
||||
self._beforeGlue = '?'
|
||||
|
||||
def _response_to_items(self, response):
|
||||
yielded = set()
|
||||
for post in response['data']:
|
||||
if post['post']['id'] not in yielded:
|
||||
yield GabPost(
|
||||
url = f'https://gab.com/{post["post"]["user"]["username"]}/posts/{post["post"]["id"]}',
|
||||
date = datetime.datetime.strptime(post['post']['created_at'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z'),
|
||||
content = post['post']['body'],
|
||||
)
|
||||
yielded.add(post['post']['id'])
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._baseUrl, headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.error('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
response = json.loads(r.text)
|
||||
if not response['data']:
|
||||
logger.error('User has no posts')
|
||||
return
|
||||
yield from self._response_to_items(response)
|
||||
if self._mode == 'posts':
|
||||
before = response['data'][-1]['published_at']
|
||||
elif self._mode in ('comments', 'media'):
|
||||
before = 30
|
||||
|
||||
while True:
|
||||
logger.info('Retrieving next page')
|
||||
r = self._get(f'{self._baseUrl}{self._beforeGlue}before={before}', headers = headers)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
response = json.loads(r.text)
|
||||
yield from self._response_to_items(response)
|
||||
if response['no-more'] or not response['data']:
|
||||
# Last page
|
||||
return
|
||||
if self._mode == 'posts':
|
||||
before = response['data'][-1]['published_at']
|
||||
elif self._mode in ('comments', 'media'):
|
||||
before += 30
|
||||
time.sleep(1) # Gab's API is pretty quick but doesn't like being hammered...
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A Gab username')
|
||||
|
||||
|
||||
class GabUserPostsScraper(GabUserCommonScraper):
|
||||
name = 'gab-user'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('posts', args.username, retries = args.retries)
|
||||
|
||||
|
||||
class GabUserCommentsScraper(GabUserCommonScraper):
|
||||
name = 'gab-user-comments'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('comments', args.username, retries = args.retries)
|
||||
|
||||
|
||||
class GabUserMediaScraper(GabUserCommonScraper):
|
||||
name = 'gab-user-media'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('media', args.username, retries = args.retries)
|
||||
@@ -1,102 +0,0 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GooglePlusUserScraper(snscrape.base.Scraper):
|
||||
name = 'googleplus-user'
|
||||
|
||||
def __init__(self, user, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._user = user
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
|
||||
if r.status_code == 404:
|
||||
logger.warning('User does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# Global data; only needed for the session ID
|
||||
#TODO: Make this more robust somehow
|
||||
match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
|
||||
if not match:
|
||||
logger.error('Unable to find session ID')
|
||||
return
|
||||
sid = match.group('sid')
|
||||
|
||||
# Page data
|
||||
# As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
|
||||
match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
|
||||
if not match:
|
||||
logger.error('Unable to extract data')
|
||||
return
|
||||
jsonData = match.group('data')
|
||||
response = json.loads(jsonData)
|
||||
if response[0][7] is None:
|
||||
logger.info('User has no posts')
|
||||
return
|
||||
for postObj in response[0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
cursor = response[0][1] # 'ADSJ_x'
|
||||
if cursor is None:
|
||||
# No further pages
|
||||
return
|
||||
baseDate = datetime.datetime.utcnow()
|
||||
baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
|
||||
userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]
|
||||
|
||||
for counter in itertools.count(start = 2):
|
||||
logger.info('Retrieving next page')
|
||||
reqid = 1 + baseSeconds + int(1e5) * counter
|
||||
r = self._post(
|
||||
f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
|
||||
data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
|
||||
headers = headers
|
||||
)
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
|
||||
# As if everything up to here wasn't terrible already, this is where it gets *really* bad.
|
||||
# The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
|
||||
# The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
|
||||
# It sucks.
|
||||
# Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
|
||||
# I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
|
||||
# Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.
|
||||
|
||||
garbage = r.text
|
||||
assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
|
||||
data = []
|
||||
pos = 6
|
||||
while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
|
||||
pos += 1
|
||||
response = json.JSONDecoder().raw_decode(''.join(garbage[pos:]))[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end
|
||||
|
||||
for postObj in response[0][2]['74333095'][0][7]:
|
||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
||||
|
||||
cursor = response[0][2]['74333095'][0][1]
|
||||
|
||||
if cursor is None:
|
||||
break
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.user, retries = args.retries)
|
||||
@@ -1,73 +1,65 @@
|
||||
__all__ = ['InstagramPost', 'User', 'InstagramUserScraper', 'InstagramHashtagScraper', 'InstagramLocationScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InstagramPost(typing.NamedTuple, snscrape.base.Item):
|
||||
cleanUrl: str
|
||||
dirtyUrl: str
|
||||
@dataclasses.dataclass
|
||||
class InstagramPost(snscrape.base.Item):
|
||||
url: str
|
||||
date: datetime.datetime
|
||||
content: str
|
||||
content: typing.Optional[str]
|
||||
thumbnailUrl: str
|
||||
displayUrl: str
|
||||
username: str
|
||||
username: typing.Optional[str]
|
||||
likes: int
|
||||
comments: int
|
||||
commentsDisabled: bool
|
||||
isVideo: bool
|
||||
|
||||
def __str__(self):
|
||||
return self.cleanUrl
|
||||
return self.url
|
||||
|
||||
|
||||
class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, mode, name, **kwargs):
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Item):
|
||||
username: str
|
||||
name: typing.Optional[str]
|
||||
followers: snscrape.base.IntWithGranularity
|
||||
following: snscrape.base.IntWithGranularity
|
||||
posts: snscrape.base.IntWithGranularity
|
||||
|
||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||
|
||||
def __str__(self):
|
||||
return f'https://www.instagram.com/{self.username}/'
|
||||
|
||||
|
||||
class _InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if mode not in ('User', 'Hashtag', 'Location'):
|
||||
raise ValueError('Invalid mode')
|
||||
self._mode = mode
|
||||
self._name = name
|
||||
|
||||
if self._mode == 'User':
|
||||
self._initialUrl = f'https://www.instagram.com/{self._name}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
self._responseContainer = 'user'
|
||||
self._edgeXToMedia = 'edge_owner_to_timeline_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
elif self._mode == 'Hashtag':
|
||||
self._initialUrl = f'https://www.instagram.com/explore/tags/{self._name}/'
|
||||
self._pageName = 'TagPage'
|
||||
self._responseContainer = 'hashtag'
|
||||
self._edgeXToMedia = 'edge_hashtag_to_media'
|
||||
self._pageIDKey = 'name'
|
||||
self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
|
||||
self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
elif self._mode == 'Location':
|
||||
self._initialUrl = f'https://www.instagram.com/explore/locations/{self._name}/'
|
||||
self._pageName = 'LocationsPage'
|
||||
self._responseContainer = 'location'
|
||||
self._edgeXToMedia = 'edge_location_to_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
self._initialPage = None
|
||||
|
||||
def _response_to_items(self, response):
|
||||
for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
code = node['node']['shortcode']
|
||||
username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else ''
|
||||
usernameQuery = '?taken-by=' + username
|
||||
cleanUrl = f'https://www.instagram.com/p/{code}/'
|
||||
username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else None
|
||||
url = f'https://www.instagram.com/p/{code}/'
|
||||
yield InstagramPost(
|
||||
cleanUrl = cleanUrl,
|
||||
dirtyUrl = f'{cleanUrl}{usernameQuery}',
|
||||
url = url,
|
||||
date = datetime.datetime.fromtimestamp(node['node']['taken_at_timestamp'], datetime.timezone.utc),
|
||||
content = node['node']['edge_media_to_caption']['edges'][0]['node']['text'] if len(node['node']['edge_media_to_caption']['edges']) else None,
|
||||
thumbnailUrl = node['node']['thumbnail_src'],
|
||||
@@ -79,6 +71,17 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
isVideo = node['node']['is_video'],
|
||||
)
|
||||
|
||||
def _initial_page(self):
|
||||
if self._initialPage is None:
|
||||
_logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
|
||||
if r.status_code not in (200, 404):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
self._initialPage = r
|
||||
return self._initialPage
|
||||
|
||||
def _check_initial_page_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return True, None
|
||||
@@ -93,6 +96,8 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
def _check_json_callback(self, r):
|
||||
if r.status_code != 200:
|
||||
return False, f'status code {r.status_code}'
|
||||
if r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||
raise snscrape.base.ScraperException('Redirected to login page')
|
||||
try:
|
||||
obj = json.loads(r.text)
|
||||
except json.JSONDecodeError as e:
|
||||
@@ -101,23 +106,17 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return True, None
|
||||
|
||||
def get_items(self):
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
logger.info('Retrieving initial data')
|
||||
r = self._get(self._initialUrl, headers = headers, responseOkCallback = self._check_initial_page_callback)
|
||||
r = self._initial_page()
|
||||
if r.status_code == 404:
|
||||
logger.warning(f'{self._mode} does not exist')
|
||||
return
|
||||
elif r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
_logger.warning('Page does not exist')
|
||||
return
|
||||
response = r._snscrape_json_obj
|
||||
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
||||
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
||||
logger.info(f'{self._mode} has no posts')
|
||||
_logger.info('Page has no posts')
|
||||
return
|
||||
if not response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
logger.warning('Private account')
|
||||
_logger.warning('Private account')
|
||||
return
|
||||
pageID = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._pageIDKey]
|
||||
yield from self._response_to_items(response['entry_data'][self._pageName][0]['graphql'])
|
||||
@@ -125,16 +124,16 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
return
|
||||
endCursor = response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
headers = self._headers.copy()
|
||||
while True:
|
||||
logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
_logger.info(f'Retrieving endCursor = {endCursor!r}')
|
||||
variables = self._variablesFormat.format(**locals())
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
|
||||
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(f'Got status code {r.status_code}')
|
||||
return
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
|
||||
response = r._snscrape_json_obj
|
||||
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
@@ -145,37 +144,100 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
||||
endCursor = response['data'][self._responseContainer][self._edgeXToMedia]['page_info']['end_cursor']
|
||||
|
||||
|
||||
class InstagramUserScraper(InstagramCommonScraper):
|
||||
class InstagramUserScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-user'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'An Instagram username (no leading @)')
|
||||
def __init__(self, username, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/{username}/'
|
||||
self._pageName = 'ProfilePage'
|
||||
self._responseContainer = 'user'
|
||||
self._edgeXToMedia = 'edge_owner_to_timeline_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
def _get_entity(self):
|
||||
r = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
return
|
||||
if '<meta property="og:description" content="' not in r.text:
|
||||
return
|
||||
ogDescriptionContentPos = r.text.index('<meta property="og:description" content="') + len('<meta property="og:description" content="')
|
||||
ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]
|
||||
|
||||
numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
|
||||
ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_-]+))$')
|
||||
m = ogDescriptionPattern.match(ogDescription)
|
||||
assert m, 'unexpected og:description format'
|
||||
|
||||
def parse_num(s):
|
||||
if s.endswith('m'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
elif s.endswith('k'):
|
||||
return int(float(s[:-1].replace(',', '')) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].replace(',', '').split('.')[1]))
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
|
||||
following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
|
||||
posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
|
||||
return User(
|
||||
username = m.group(5) or m.group(6),
|
||||
name = m.group(4) or None,
|
||||
followers = followers,
|
||||
following = following,
|
||||
posts = posts,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('User', args.username, retries = args.retries)
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'An Instagram username (no leading @)')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
|
||||
|
||||
class InstagramHashtagScraper(InstagramCommonScraper):
|
||||
class InstagramHashtagScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-hashtag'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('hashtag', help = 'An Instagram hashtag (no leading #)')
|
||||
def __init__(self, hashtag, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
|
||||
self._pageName = 'TagPage'
|
||||
self._responseContainer = 'hashtag'
|
||||
self._edgeXToMedia = 'edge_hashtag_to_media'
|
||||
self._pageIDKey = 'name'
|
||||
self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
|
||||
self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('Hashtag', args.hashtag, retries = args.retries)
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('hashtag', type = snscrape.base.nonempty_string('hashtag'), help = 'An Instagram hashtag (no leading #)')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.hashtag)
|
||||
|
||||
|
||||
class InstagramLocationScraper(InstagramCommonScraper):
|
||||
class InstagramLocationScraper(_InstagramCommonScraper):
|
||||
name = 'instagram-location'
|
||||
|
||||
def __init__(self, locationId, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
|
||||
self._pageName = 'LocationsPage'
|
||||
self._responseContainer = 'location'
|
||||
self._edgeXToMedia = 'edge_location_to_media'
|
||||
self._pageIDKey = 'id'
|
||||
self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
|
||||
self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('locationid', help = 'An Instagram location ID', type = int)
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls('Location', args.locationid, retries = args.retries)
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.locationid)
|
||||
|
||||
340
snscrape/modules/mastodon.py
Normal file
340
snscrape/modules/mastodon.py
Normal file
@@ -0,0 +1,340 @@
|
||||
__all__ = ['Toot', 'Boost', 'Attachment', 'Poll', 'PollOption', 'User', 'CustomEmoji', 'MastodonProfileScraper', 'MastodonTootScraperMode', 'MastodonTootScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import enum
|
||||
import json
|
||||
import logging
|
||||
import snscrape.base
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Toot(snscrape.base.Item):
    '''A single toot; its string form is the toot's URL.'''

    # Required fields
    url: str
    id: str
    user: 'User'
    date: datetime.datetime
    text: str

    # Optional fields, None when not set by the scraper
    spoilerText: typing.Optional[str] = None
    attachments: typing.Optional[typing.List['Attachment']] = None
    links: typing.Optional[typing.List[str]] = None
    mentionedUsers: typing.Optional[typing.List['User']] = None
    hashtags: typing.Optional[typing.List[str]] = None
    poll: typing.Optional['Poll'] = None

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Boost(snscrape.base.Item):
    '''A boost: the boosting user together with the boosted toot.'''

    user: 'User'
    toot: Toot

    def __str__(self):
        # A boost has no URL of its own, so it is represented by the boosted toot
        return str(self.toot)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Attachment:
    '''An attachment of a toot, identified by its URL and a name.'''

    url: str
    name: str
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Poll:
    '''A poll attached to a toot.'''

    id: str
    expirationDate: datetime.datetime
    multiple: bool
    options: typing.List['PollOption']
    votesCount: int
    # Available since version 3.0.0 (commit 3babf846); None when not provided
    votersCount: typing.Optional[int] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PollOption:
    '''One option of a poll and the number of votes it has.'''

    title: str
    votesCount: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Item):
    '''A user account; its string form is the profile URL.'''

    account: str # @username@domain.invalid
    displayName: typing.Optional[str] = None
    displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
    avatarUrl: typing.Optional[str] = None
    _url: typing.Optional[str] = None

    @property
    def url(self):
        '''The explicitly set URL if there is one, else a URL derived from the account string.'''
        if not self._url:
            # '@username@domain.invalid' -> 'https://domain.invalid/@username'
            parts = self.account[1:].split('@')
            return 'https://' + '/@'.join(parts[::-1])
        return self._url

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CustomEmoji:
    '''A custom emoji appearing in a display name.'''

    shortName: str
    url: str
    # Optional because _display_name constructs the emoji with only one URL for 'emojione' images,
    # which carry no data-* attributes; with a required field that call raised TypeError.
    staticUrl: typing.Optional[str] = None
|
||||
|
||||
|
||||
class _MastodonCommonScraper(snscrape.base.Scraper):
    '''Shared base of the Mastodon scrapers: throttled requests and conversion of status HTML into items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        # time.time() value of the most recent request; 0 until the first one
        self._lastRequest = 0

    def _rate_limited_get(self, *args, **kwargs):
        '''Forward to self._get after sleeping as needed so that requests start at least 3 seconds apart.'''
        elapsed = time.time() - self._lastRequest
        if elapsed < 3:
            time.sleep(3 - elapsed)
        self._lastRequest = time.time()
        return self._get(*args, **kwargs)

    def _entries_to_items(self, entries, url):
        '''Yield a Toot, or a Boost wrapping one, for each status entry.

        entries is an iterable of parsed entry elements; url is the page URL against which relative links are resolved.
        Entries that contain a 'load-more' link are skipped.
        '''
        for entry in entries:
            if entry.find('a', class_ = 'load-more'):
                continue

            fields = {}

            # Status header; the fallbacks are only looked up when the previous lookup found nothing
            info = (
                entry.find('div', class_ = 'status__info')
                or entry.find('div', class_ = 'status__header') # Before 2.5.0 (commit bb71538b)
                or entry.find('div', class_ = 'detailed-status__meta') # Detailed status (i.e. toot page rather than timeline)?
            )
            link = (
                info.find('a', class_ = 'status__relative-time')
                or info.find('a', class_ = 'detailed-status__datetime') # Detailed status?
            )
            fields['url'] = link['href']
            fields['id'] = fields['url'].rsplit('/', 1)[1]
            published = info.find('data', class_ = 'dt-published')['value']
            fields['date'] = datetime.datetime.strptime(published, '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo = datetime.timezone.utc)

            # Author
            userLink = (
                info.find('a', class_ = 'status__display-name')
                or entry.find('a', class_ = 'detailed-status__display-name') # Detailed status?
            )
            nameSpan = userLink.find('span', class_ = 'display-name')
            account = nameSpan.find('span').text.strip()
            if account.count('@') == 1: # Ancient versions don't include the instance for posts from accounts on the instance itself
                account = self._url_to_account(userLink['href'])
            profileUrl = urllib.parse.urljoin(url, userLink['href'])
            plainName, nameWithEmojis = self._display_name(nameSpan.find('strong'), url)
            avatarUrl = urllib.parse.urljoin(url, userLink.find('img', class_ = 'u-photo')['src'])
            fields['user'] = User(
                account = account,
                _url = profileUrl,
                displayName = plainName,
                displayNameWithCustomEmojis = nameWithEmojis,
                avatarUrl = avatarUrl,
            )

            # Text; when a spoiler link is present, the summary becomes the text and the actual content the spoiler text
            content = entry.find('div', class_ = 'status__content')
            if content.find(class_ = 'status__content__spoiler-link'):
                fields['text'] = content.find('span', class_ = 'p-summary').text
                fields['spoilerText'] = '\n\n'.join(p.text for p in content.find('div', class_ = 'e-content').find_all('p'))
            else:
                fields['text'] = '\n\n'.join(p.text for p in content.find_all('p'))

            # Attachments; the markup differs between versions
            if (listDiv := entry.find('div', class_ = 'attachment-list')):
                fields['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, a['href']), name = a.text.strip())
                    for a in listDiv.find_all('a')
                ]
            elif (galleryDiv := entry.find('div', attrs = {'data-component': 'MediaGallery'})): # Before 2.7.0 (https://github.com/mastodon/mastodon/issues/6714)
                props = json.loads(galleryDiv['data-props'])
                fields['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, medium['url']), name = medium['url'].rsplit('/', 1)[-1].strip())
                    for medium in props['media']
                ]
            elif (legacyDiv := entry.find('div', class_ = 'status__attachments')): # Before 2.3.0 (commit 2bbf987a)
                fields['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, a['href']), name = a['href'].rsplit('/', 1)[1])
                    for a in legacyDiv.find_all('a')
                ]

            # Anchors in the content are mentions, hashtags, or plain links
            links, mentionedUsers, hashtags = [], [], []
            for anchor in content.find_all('a'):
                classes = anchor.get('class', [])
                isMention = 'mention' in classes
                if isMention and 'u-url' in classes:
                    mentionUrl = urllib.parse.urljoin(url, anchor['href'])
                    mentionedUsers.append(User(account = self._url_to_account(mentionUrl), _url = mentionUrl))
                elif isMention and 'hashtag' in classes:
                    hashtags.append(anchor.text.strip())
                else:
                    links.append(urllib.parse.urljoin(url, anchor['href']))
            # Empty lists are not passed on, leaving the Toot defaults (None) in place
            for key, values in (('links', links), ('mentionedUsers', mentionedUsers), ('hashtags', hashtags)):
                if values:
                    fields[key] = values

            if (pollDiv := entry.find('div', attrs = {'data-component': 'Poll'})):
                props = json.loads(pollDiv['data-props'])
                pollFields = {
                    'id': props['poll']['id'],
                    'expirationDate': datetime.datetime.strptime(props['poll']['expires_at'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo = datetime.timezone.utc),
                    'multiple': props['poll']['multiple'],
                    'options': [PollOption(title = option['title'], votesCount = option['votes_count']) for option in props['poll']['options']],
                    'votesCount': props['poll']['votes_count'],
                }
                if 'voters_count' in props['poll']: # 3.0.0 (commit 3babf846)
                    pollFields['votersCount'] = props['poll']['voters_count']
                fields['poll'] = Poll(**pollFields)

            item = Toot(**fields)

            # Boosts
            prepend = (
                entry.find('div', class_ = 'status__prepend')
                or entry.find('div', class_ = 'pre-header') # Before 2.5.0 (commit bb71538b)
            )
            if prepend and prepend.find('i', class_ = 'fa-retweet'): # Is a boost
                boosterLink = prepend.find('a', class_ = 'status__display-name')
                # The user is always on this instance since that's the only place where boosts are shown, hence there is no explicit account span. Reconstruct from URL.
                boosterUrl = urllib.parse.urljoin(url, boosterLink['href'])
                assert boosterUrl.count('/') == 3 and boosterUrl.count('/@') == 1
                boosterName, boosterNameWithEmojis = self._display_name(boosterLink.find('strong'), url)
                booster = User(
                    account = '@'.join(reversed(boosterUrl.split('/')[2:])),
                    displayName = boosterName,
                    displayNameWithCustomEmojis = boosterNameWithEmojis,
                )
                item = Boost(user = booster, toot = item)

            yield item

    def _display_name(self, strong, url):
        '''Return (plain display name, list of strings and CustomEmoji objects) for a display name element.

        The second element is None if the name contains no custom emoji. Unexpected children are logged and ignored.
        '''
        plainParts = []
        fullParts = []
        sawCustomEmoji = False
        for child in strong.children:
            if isinstance(child, bs4.element.NavigableString):
                text = str(child)
                plainParts.append(text)
                fullParts.append(text)
                continue
            # Only img elements are inspected for emoji classes
            classes = child.get('class', []) if child.name == 'img' else ()
            if 'custom-emoji' in classes:
                sawCustomEmoji = True
                plainParts.append(child['alt'])
                fullParts.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['data-original']), staticUrl = urllib.parse.urljoin(url, child['data-static'])))
            elif 'emojione' in classes:
                # Version 2.0.0 (which first added custom emojis) to 2.9.4: no data-* attributes, only gets one of the URLs with no (easy, reliable) way of knowing which it is.
                sawCustomEmoji = True
                plainParts.append(child['alt'])
                fullParts.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['src'])))
            else:
                _logger.warning(f'Unexpected display name child: {child!r}')
        full = fullParts if sawCustomEmoji else None
        return ''.join(plainParts), full

    @staticmethod
    def _url_to_account(url):
        '''Convert a profile URL into an '@user@domain' account string; raises ValueError for unrecognised URL shapes.'''
        slashCount = url.count('/')
        segments = url.split('/')
        if slashCount == 3 and url.count('/@') == 1:
            # https://domain/@user; the last segment already carries the leading '@'
            return '@'.join(reversed(segments[2:]))
        # https://domain/<marker>/user: '/users/' e.g. Pleroma, also supported by Mastodon; '/accounts/' e.g. Peertube; '/profile/' e.g. Friendica
        if slashCount == 4 and any(marker in url for marker in ('/users/', '/accounts/', '/profile/')):
            return '@' + '@'.join(reversed(segments[2::2]))
        raise ValueError('Unrecognised account URL format')
|
||||
|
||||
|
||||
class MastodonProfileScraper(_MastodonCommonScraper):
    '''Scraper for the toots shown on a Mastodon profile page.'''

    name = 'mastodon-profile'

    def __init__(self, account, **kwargs):
        '''account is either a string of the form @account@instance or a profile URL, which is used as is.'''
        super().__init__(**kwargs)
        isHandle = account.startswith('@') and account.count('@') == 2
        if isHandle:
            username, domain = account[1:].split('@')
            self._url = f'https://{domain}/@{username}'
        else:
            self._url = account

    def get_items(self):
        '''Yield the items of the profile page and of every following page.

        Logs a warning and yields nothing if the profile returns 404.
        Raises ScraperException on any other unexpected status code.
        '''
        # First page: prefer the variant including replies and fall back to the plain profile on 404
        r = self._rate_limited_get(f'{self._url}/with_replies', headers = self._headers)
        if r.status_code not in (200, 404):
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        if r.status_code == 404: # Possibly an old instance where with_replies doesn't exist, try without that.
            r = self._rate_limited_get(self._url, headers = self._headers)
            if r.status_code not in (200, 404):
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            if r.status_code == 404:
                _logger.warning('Account does not exist')
                return
            _logger.warning('Old Mastodon instance, cannot retrieve reply toots')

        while True:
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            stream = soup.find('div', class_ = 'activity-stream')
            yield from self._entries_to_items(stream.find_all('div', class_ = 'entry'), r.url)

            nextLink = soup.find('a', class_ = 'load-more', href = lambda x: '?max_id=' in x or '&max_id=' in x)
            if not nextLink: # Before 2.5.0 (commit bb71538b)
                pagination = soup.find('div', class_ = 'pagination')
                if pagination:
                    nextLink = pagination.find('a', class_ = 'next')
            if not nextLink: # End of pagination
                return

            r = self._rate_limited_get(urllib.parse.urljoin(r.url, nextLink['href']), headers = self._headers)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('account', type = snscrape.base.nonempty_string('account'), help = 'A Mastodon account. This can be either a URL to the profile page or a string of the form @account@instance.example.org')

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.account)
|
||||
|
||||
|
||||
class MastodonTootScraperMode(enum.Enum):
    '''What MastodonTootScraper collects: only the referenced toot or the thread around it.'''

    SINGLE = 'single'
    THREAD = 'thread'

    @classmethod
    def _cli_from_args(cls, args):
        '''Select the mode from parsed CLI arguments: THREAD if args.thread is truthy, else SINGLE.'''
        return cls.THREAD if args.thread else cls.SINGLE
|
||||
|
||||
|
||||
class MastodonTootScraper(_MastodonCommonScraper):
    '''Scraper for a single toot page, optionally including the thread shown on that page.'''

    name = 'mastodon-toot'

    def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
        super().__init__(**kwargs)
        self._url = url
        self._mode = mode

    def get_items(self):
        '''Yield the referenced toot (SINGLE) or every entry of the page's activity stream (THREAD).

        Logs a warning and yields nothing on 404; raises ScraperException on any other non-200 status.
        '''
        r = self._rate_limited_get(self._url, headers = self._headers)
        if r.status_code == 404:
            _logger.warning('Toot does not exist')
            return
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

        soup = bs4.BeautifulSoup(r.text, 'lxml')
        if self._mode is MastodonTootScraperMode.SINGLE:
            # The entry is the element directly containing the detailed status
            entries = [soup.find('div', class_ = 'detailed-status').parent]
        elif self._mode is MastodonTootScraperMode.THREAD:
            entries = soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry')
        else:
            return
        yield from self._entries_to_items(entries, r.url)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('--thread', action = 'store_true', help = 'Collect thread around the toot referenced by the URL')
        subparser.add_argument('url', type = snscrape.base.nonempty_string('url'), help = 'A URL for a toot')

    @classmethod
    def _cli_from_args(cls, args):
        mode = MastodonTootScraperMode._cli_from_args(args)
        return cls._cli_construct(args, args.url, mode = mode)
|
||||
279
snscrape/modules/reddit.py
Normal file
279
snscrape/modules/reddit.py
Normal file
@@ -0,0 +1,279 @@
|
||||
__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import snscrape.version
|
||||
import string
|
||||
import time
|
||||
import typing
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
|
||||
|
||||
@dataclasses.dataclass
class Submission(snscrape.base.Item):
    '''A Reddit submission; its string form is the submission URL.'''

    author: typing.Optional[str] # E.g. submission hf7k6
    date: datetime.datetime
    id: str
    link: typing.Optional[str]
    selftext: typing.Optional[str]
    subreddit: typing.Optional[str] # E.g. submission 617p51
    title: str
    url: str

    # Old name of 'date', kept as a deprecated property
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Comment(snscrape.base.Item):
    '''A Reddit comment; its string form is the comment URL.'''

    author: typing.Optional[str]
    body: str
    date: datetime.datetime
    id: str
    parentId: typing.Optional[str]
    subreddit: typing.Optional[str]
    url: str

    # Old name of 'date', kept as a deprecated property
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
def _cmp_id(id1, id2):
    '''Three-way comparison of two Reddit IDs: -1 if id1 is less than id2, 0 if they are equal, 1 if id1 is greater.

    A shorter ID is always the smaller one; IDs of equal length are compared as strings.
    Both IDs may carry a prefix like t1_, but then it must be present on both and equal.
    Raises ValueError on a prefix mismatch or on characters other than lowercase letters and digits.'''

    if id1.startswith('t') and '_' in id1:
        prefix, _, id1 = id1.partition('_')
        if not id2.startswith(prefix + '_'):
            raise ValueError('id2 must have the same prefix as id1')
        id2 = id2.partition('_')[2]

    allowed = set(string.ascii_lowercase + string.digits)
    for label, value in (('id1', id1), ('id2', id2)):
        if not allowed.issuperset(value):
            raise ValueError(f'invalid characters in {label}')

    # Ordering by (length, text) puts shorter IDs first and breaks ties lexicographically
    key1 = (len(id1), id1)
    key2 = (len(id2), id2)
    return (key1 > key2) - (key1 < key2)
|
||||
|
||||
|
||||
class _RedditPushshiftScraper(snscrape.base.Scraper):
    '''Shared base of the Reddit scrapers: Pushshift API requests, pagination, and conversion of API objects into items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}

    def _handle_rate_limiting(self, r):
        '''Response check passed to self._get: returns (ok, reason). Sleeps 10 seconds on a 429 response.'''
        status = r.status_code
        if status == 200:
            return True, None
        if status == 429:
            _logger.info('Got 429 response, sleeping')
            time.sleep(10)
            return False, 'rate-limited'
        return False, 'non-200 status code'

    def _get_api(self, url, params = None):
        '''GET an API URL and return the decoded JSON; raises ScraperException on a non-200 final status.'''
        r = self._get(url, params = params, headers = self._headers, responseOkCallback = self._handle_rate_limiting)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        return r.json()

    def _api_obj_to_item(self, d):
        '''Convert one API object into a Submission (if it has a 'title' key) or a Comment.'''
        isSubmission = 'title' in d

        # Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all
        permalink = d.get('permalink')
        if permalink is None:
            # E.g. comment dovj2v7
            permalink = d.get('permalink_url')
        if permalink is None:
            if 'link_id' in d and d['link_id'].startswith('t3_'): # E.g. comment doraazf
                # The subreddit part is left out when unknown, e.g. submission 617p51 but can likely happen for comments as well
                subredditPart = f'/r/{d["subreddit"]}' if 'subreddit' in d else ''
                permalink = f'{subredditPart}/comments/{d["link_id"][3:]}/_/{d["id"]}/'
            else:
                _logger.warning('Unable to find or construct permalink')
                permalink = '/'

        common = {
            'author': d.get('author'),
            'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
            'url': f'https://old.reddit.com{permalink}',
            'subreddit': d.get('subreddit'),
        }

        if not isSubmission:
            return Comment(body = d['body'], parentId = d.get('parent_id'), id = f't1_{d["id"]}', **common)

        selftext = d.get('selftext') or None
        link = None
        if not selftext:
            # Only submissions without selftext get a link; site-relative URLs are made absolute
            link = d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}'
            # A link pointing at the submission itself carries no information
            if link == common['url'] or common['url'].replace('//old.reddit.com/', '//www.reddit.com/') == link:
                link = None
        return Submission(selftext = selftext, link = link, title = d['title'], id = f't3_{d["id"]}', **common)

    def _iter_api(self, url, params = None):
        '''Iterate through the Pushshift API using the 'until' parameter and yield the items.'''
        if params is None:
            params = {}
        lowestIdSeen = None
        while True:
            batch = self._get_api(url, params = params)['data']
            nothingNew = lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in batch)
            if not batch or nothingNew: # end of pagination
                return
            for d in batch:
                # Only objects with an ID below everything yielded so far are new
                if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
                    yield self._api_obj_to_item(d)
                    lowestIdSeen = d['id']
            params['until'] = batch[-1]['created_utc'] + 1
|
||||
|
||||
|
||||
class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
    '''Base of the scrapers that search submissions and/or comments by a single API field.

    Subclasses provide name, _validationFunc and _apiField.
    '''

    def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
        '''Raises ValueError if name fails the subclass validation or if both submissions and comments are disabled.'''
        super().__init__(**kwargs)
        self._name = name
        self._submissions = submissions
        self._comments = comments
        self._before = before
        self._after = after

        scraperClass = type(self)
        if not scraperClass._validationFunc(self._name):
            raise ValueError(f'invalid {type(self).name.split("-", 1)[1]} name')
        if not (self._submissions or self._comments):
            raise ValueError('At least one of submissions and comments must be True')

    def _iter_api_submissions_and_comments(self, params: dict):
        '''Yield submissions and comments merged by date, newest first; on equal dates the comment comes first.

        params is extended in place with the limit and the before/after bounds.
        '''
        # Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
        params['limit'] = '1000'
        if self._before is not None:
            params['until'] = self._before
        if self._after is not None:
            params['since'] = self._after

        # Pass copies to prevent the two iterators from messing each other up by using the same dict
        submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission', params.copy()) if self._submissions else iter(())
        commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment', params.copy()) if self._comments else iter(())

        exhausted = object() # Marker returned by next() once an iterator has run out

        tipSubmission = next(submissionsIter, exhausted)
        if tipSubmission is exhausted:
            # There are no submissions, just yield comments and return
            yield from commentsIter
            return
        tipComment = next(commentsIter, exhausted)
        if tipComment is exhausted:
            # There are no comments, just yield submissions and return
            yield tipSubmission
            yield from submissionsIter
            return

        while True:
            # Return newer first; if both have the same creation datetime, return the comment first
            if tipSubmission.date > tipComment.date:
                yield tipSubmission
                tipSubmission = next(submissionsIter, exhausted)
                if tipSubmission is exhausted:
                    # Reached the end of submissions, just yield the remaining comments and stop
                    yield tipComment
                    yield from commentsIter
                    return
            else:
                yield tipComment
                tipComment = next(commentsIter, exhausted)
                if tipComment is exhausted:
                    # Reached the end of comments, just yield the remaining submissions and stop
                    yield tipSubmission
                    yield from submissionsIter
                    return

    def get_items(self):
        searchParams = {type(self)._apiField: self._name}
        yield from self._iter_api_submissions_and_comments(searchParams)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('--no-submissions', dest = 'noSubmissions', action = 'store_true', default = False, help = 'Don\'t list submissions')
        subparser.add_argument('--no-comments', dest = 'noComments', action = 'store_true', default = False, help = 'Don\'t list comments')
        subparser.add_argument('--before', metavar = 'TIMESTAMP', type = int, help = 'Fetch results before a Unix timestamp')
        subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp')
        # The positional argument is named after the part of the scraper name following 'reddit-'
        argName = cls.name.split('-', 1)[1]
        subparser.add_argument(argName, type = snscrape.base.nonempty_string(argName))

    @classmethod
    def _cli_from_args(cls, args):
        argName = cls.name.split('-', 1)[1]
        return cls._cli_construct(
            args,
            getattr(args, argName),
            submissions = not args.noSubmissions,
            comments = not args.noComments,
            before = args.before,
            after = args.after,
        )
|
||||
|
||||
|
||||
class RedditUserScraper(_RedditPushshiftSearchScraper):
    '''Search scraper filtering on the 'author' API field.'''

    name = 'reddit-user'
    # fullmatch rather than '^...$': '$' also matches before a trailing newline, which let 'name\n' pass validation
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9_-]{3,20}', x)
    _apiField = 'author'
|
||||
|
||||
|
||||
class RedditSubredditScraper(_RedditPushshiftSearchScraper):
    '''Search scraper filtering on the 'subreddit' API field.'''

    name = 'reddit-subreddit'
    # fullmatch rather than '^...$': '$' also matches before a trailing newline, which let 'name\n' pass validation
    _validationFunc = lambda x: re.fullmatch('[A-Za-z0-9][A-Za-z0-9_]{2,20}', x)
    _apiField = 'subreddit'
|
||||
|
||||
|
||||
class RedditSearchScraper(_RedditPushshiftSearchScraper):
    '''Search scraper passing the query as the 'q' API field.'''

    name = 'reddit-search'
    # Every query string is accepted
    _validationFunc = lambda query: True
    _apiField = 'q'
|
||||
|
||||
|
||||
class RedditSubmissionScraper(_RedditPushshiftScraper):
    '''Scraper yielding a single submission followed by its comments.'''

    name = 'reddit-submission'

    def __init__(self, submissionId, **kwargs):
        '''submissionId is a base-36 submission ID, optionally prefixed with 't3_'.

        Raises ValueError if the ID (without prefix) is empty or contains anything but lowercase letters and digits.
        '''
        bareId = submissionId[3:] if submissionId.startswith('t3_') else submissionId
        # An empty ID used to pass this check and only failed later in the base-36 conversion
        if not bareId or bareId.strip(string.ascii_lowercase + string.digits) != '':
            raise ValueError('invalid submissionId')
        super().__init__(**kwargs)
        self._submissionId = submissionId

    def get_items(self):
        '''Yield the submission, then its comments. Yields nothing if the submission is not found.

        Raises ScraperException if the API returns more than one result for the ID.
        '''
        obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission?ids={self._submissionId}')
        if not obj['data']:
            return
        if len(obj['data']) != 1:
            raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1')
        yield self._api_obj_to_item(obj['data'][0])

        # int() accepts underscores between digits, so a 't3_' prefix would be silently parsed as part of the number; convert the bare ID only.
        bareId = self._submissionId[3:] if self._submissionId.startswith('t3_') else self._submissionId
        # Upstream bug: link_id must be provided in decimal https://old.reddit.com/r/pushshift/comments/zkggt0/update_on_colo_switchover_bug_fixes_reindexing/
        yield from self._iter_api('https://api.pushshift.io/reddit/search/comment', {'link_id': int(bareId, 36), 'limit': 1000})

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId'))

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.submissionId)
|
||||
336
snscrape/modules/telegram.py
Normal file
336
snscrape/modules/telegram.py
Normal file
@@ -0,0 +1,336 @@
|
||||
__all__ = ['LinkPreview', 'TelegramPost', 'Channel', 'TelegramChannelScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
|
||||
|
||||
@dataclasses.dataclass
class LinkPreview:
    '''Preview of a link: the target href plus optional descriptive fields.'''

    href: str
    # Everything below defaults to None when not provided
    siteName: typing.Optional[str] = None
    title: typing.Optional[str] = None
    description: typing.Optional[str] = None
    image: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Channel(snscrape.base.Item):
    '''A Telegram channel; its string form is the channel's https://t.me/s/ URL.'''

    username: str
    title: typing.Optional[str] = None
    verified: typing.Optional[bool] = None
    photo: typing.Optional[str] = None
    description: typing.Optional[str] = None
    members: typing.Optional[int] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    videos: typing.Optional[snscrape.base.IntWithGranularity] = None
    links: typing.Optional[snscrape.base.IntWithGranularity] = None
    files: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated properties forwarding to the granularity attribute of the corresponding counter
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
    linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
    filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')

    def __str__(self):
        return f'https://t.me/s/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
    '''A post in a Telegram channel; its string form is the post URL.'''

    url: str
    date: datetime.datetime
    content: str
    # The three lists default to None rather than to an empty list
    outlinks: typing.List[str] = None
    mentions: typing.List[str] = None
    hashtags: typing.List[str] = None
    forwarded: typing.Optional['Channel'] = None
    forwardedUrl: typing.Optional[str] = None
    media: typing.Optional[typing.List['Medium']] = None
    views: typing.Optional[snscrape.base.IntWithGranularity] = None
    linkPreview: typing.Optional[LinkPreview] = None

    # Deprecated property: the outlinks joined with single spaces
    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

    def __str__(self):
        return self.url
|
||||
|
||||
|
||||
class Medium:
    '''Common base class of the media types (Photo, Video, VoiceMessage, Gif); defines no members itself.'''
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo(Medium):
    """A photo attached to a post."""

    url: str  # image URL as extracted from the element's inline style
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video(Medium):
    """A video attached to a post."""

    # The scraper passes None when the player element has no thumbnail tag,
    # so the annotation has to allow it. The field stays required.
    thumbnailUrl: typing.Optional[str]
    duration: float  # length in seconds
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VoiceMessage(Medium):
    """A voice message attached to a post."""

    url: str
    # Length in seconds: the scraper stores the integer produced by
    # _durationStrToSeconds here, not the displayed string.
    duration: int
    # Heights of the waveform bars, parsed from the player's inline styles.
    bars: typing.List[float]
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Gif(Medium):
    """A GIF attached to a post (a video player without a duration)."""

    # The scraper passes None when the player element has no thumbnail tag,
    # so the annotation has to allow it. The field stays required.
    thumbnailUrl: typing.Optional[str]
    url: typing.Optional[str] = None
|
||||
|
||||
|
||||
class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
name = 'telegram-channel'
|
||||
|
||||
def __init__(self, name, **kwargs):
    """Set up a scraper for the channel called ``name``.

    Any further keyword arguments are handed to the base class constructor.
    """
    super().__init__(**kwargs)
    # Cache for the first page: the response and its parsed soup.
    self._initialPage = None
    self._initialPageSoup = None
    self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    self._name = name
|
||||
|
||||
def _initial_page(self):
    """Return the ``(response, soup)`` pair of the channel's first page.

    The page is requested on first use and cached on the instance.

    Raises:
        snscrape.base.ScraperException: if the response status is not 200.
    """
    if self._initialPage is not None:
        return self._initialPage, self._initialPageSoup

    response = self._get(f'https://t.me/s/{self._name}', headers=self._headers)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # Parse before storing anything so a parser failure leaves the cache empty.
    parsed = bs4.BeautifulSoup(response.text, 'lxml')
    self._initialPage = response
    self._initialPageSoup = parsed
    return self._initialPage, self._initialPageSoup
|
||||
|
||||
def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
# Generator converting the message elements of a parsed page into TelegramPost items.
# soup: parsed page; pageUrl: URL used to resolve relative links;
# onlyUsername: if true, yield only the part of the first visited post's
# 'data-post' attribute before the '/' and then stop.
# The posts are visited in reverse document order.
# NOTE(review): the original indentation of this method is not visible here, so the
# comments below describe individual statements only, not their nesting.
|
||||
posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
|
||||
for post in reversed(posts):
|
||||
if onlyUsername:
|
||||
yield post['data-post'].split('/')[0]
|
||||
return
|
||||
dateDiv = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date')
|
||||
rawUrl = dateDiv['href']
|
||||
# Sanity check only: expects 'https://t.me/<name>/<digits>' (exactly four slashes); a mismatch is logged, not rejected.
|
||||
if not rawUrl.startswith('https://t.me/') or sum(x == '/' for x in rawUrl) != 4 or rawUrl.rsplit('/', 1)[1].strip('0123456789') != '':
|
||||
_logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
|
||||
# The item URL uses the '/s/' form of the post link.
|
||||
url = rawUrl.replace('//t.me/', '//t.me/s/')
|
||||
# Drop the first two '-' and every ':' so the timestamp matches the strptime format.
|
||||
date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
|
||||
media = []
|
||||
outlinks = []
|
||||
mentions = []
|
||||
hashtags = []
|
||||
forwarded = None
|
||||
forwardedUrl = None
|
||||
|
||||
# Forwarded posts: the channel name is the path segment following 't.me/' in the forward link.
|
||||
if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
|
||||
forwardedUrl = forwardTag['href']
|
||||
forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
|
||||
forwarded = Channel(username = forwardedName)
|
||||
|
||||
# content stays None when the post has no message text element.
|
||||
if (message := post.find('div', class_ = 'tgme_widget_message_text')):
|
||||
content = message.get_text(separator="\n")
|
||||
else:
|
||||
content = None
|
||||
|
||||
# Classify every link of the post: author links, links to the post itself, single-media links, mentions, hashtags, outlinks.
|
||||
for link in post.find_all('a'):
|
||||
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
|
||||
# Author links at the top (avatar and name)
|
||||
continue
|
||||
if link['href'] == rawUrl or link['href'] == url:
|
||||
style = link.attrs.get('style', '')
|
||||
# Generic filter of links to the post itself, catches videos, photos, and the date link
|
||||
if style != '':
|
||||
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
|
||||
if len(imageUrls) == 1:
|
||||
media.append(Photo(url = imageUrls[0]))
|
||||
continue
|
||||
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
|
||||
style = link.attrs.get('style', '')
|
||||
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
|
||||
if len(imageUrls) == 1:
|
||||
media.append(Photo(url = imageUrls[0]))
|
||||
# resp = self._get(image[0])
|
||||
# encoded_string = base64.b64encode(resp.content)
|
||||
# Individual photo or video link
|
||||
continue
|
||||
if link.text.startswith('@'):
|
||||
mentions.append(link.text.strip('@'))
|
||||
continue
|
||||
if link.text.startswith('#'):
|
||||
hashtags.append(link.text.strip('#'))
|
||||
continue
|
||||
# Remaining links are outlinks: resolved against pageUrl, de-duplicated, excluding the post's own and the forward URL.
|
||||
href = urllib.parse.urljoin(pageUrl, link['href'])
|
||||
if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
|
||||
outlinks.append(href)
|
||||
|
||||
# Voice messages: audio source, duration converted to seconds, and one bar height per <s> element.
|
||||
for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
||||
audioUrl = voicePlayer.find('audio')['src']
|
||||
durationStr = voicePlayer.find('time').text
|
||||
duration = _durationStrToSeconds(durationStr)
|
||||
# Bar height = text after the last ':' of the inline style, with ';' and '%' stripped.
|
||||
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
|
||||
|
||||
media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
|
||||
|
||||
# Video players: thumbnail from the <i> tag's style, URL from the <video> tag if present.
|
||||
for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
|
||||
iTag = videoPlayer.find('i')
|
||||
if iTag is None:
|
||||
videoUrl = None
|
||||
videoThumbnailUrl = None
|
||||
else:
|
||||
style = iTag['style']
|
||||
videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
|
||||
videoTag = videoPlayer.find('video')
|
||||
videoUrl = None if videoTag is None else videoTag['src']
|
||||
mKwargs = {
|
||||
'thumbnailUrl': videoThumbnailUrl,
|
||||
'url': videoUrl,
|
||||
}
|
||||
# A player without a <time> element becomes a Gif, otherwise a Video with a duration.
|
||||
timeTag = videoPlayer.find('time')
|
||||
if timeTag is None:
|
||||
cls = Gif
|
||||
else:
|
||||
cls = Video
|
||||
durationStr = videoPlayer.find('time').text
|
||||
mKwargs['duration'] = _durationStrToSeconds(durationStr)
|
||||
media.append(cls(**mKwargs))
|
||||
|
||||
linkPreview = None
|
||||
if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
|
||||
kwargs = {}
|
||||
kwargs['href'] = urllib.parse.urljoin(pageUrl, linkPreviewA['href'])
|
||||
if (siteNameDiv := linkPreviewA.find('div', class_ = 'link_preview_site_name')):
|
||||
kwargs['siteName'] = siteNameDiv.text
|
||||
if (titleDiv := linkPreviewA.find('div', class_ = 'link_preview_title')):
|
||||
kwargs['title'] = titleDiv.text
|
||||
if (descriptionDiv := linkPreviewA.find('div', class_ = 'link_preview_description')):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
if (imageI := linkPreviewA.find('i', class_ = 'link_preview_image')):
|
||||
# 22 == len("background-image:url('"); the image URL runs up to the next single quote.
|
||||
if imageI['style'].startswith("background-image:url('"):
|
||||
kwargs['image'] = imageI['style'][22 : imageI['style'].index("'", 22)]
|
||||
else:
|
||||
_logger.warning(f'Could not process link preview image on {url}')
|
||||
linkPreview = LinkPreview(**kwargs)
|
||||
# The previewed URL is reported through linkPreview, so it is removed from outlinks.
|
||||
if kwargs['href'] in outlinks:
|
||||
outlinks.remove(kwargs['href'])
|
||||
|
||||
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
|
||||
views = None if viewsSpan is None else _parse_num(viewsSpan.text)
|
||||
|
||||
# Empty collections are reported as None.
|
||||
outlinks = outlinks if outlinks else None
|
||||
media = media if media else None
|
||||
mentions = mentions if mentions else None
|
||||
hashtags = hashtags if hashtags else None
|
||||
|
||||
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
||||
|
||||
def get_items(self):
    """Yield the posts of the channel, following the pagination of its post list.

    Nothing is yielded (and a warning is logged) when the channel has no
    public post list. Pagination ends when the first dated message on a page
    is message 1, or when no older page can be derived.

    Raises:
        snscrape.base.ScraperException: if a page request does not return
            status 200.
    """
    r, soup = self._initial_page()
    if '/s/' not in r.url:
        _logger.warning('No public post list for this user')
        return
    nextPageUrl = ''
    while True:
        yield from self._soup_to_items(soup, r.url)

        # If message 1 is the first message on the page, there is nothing older.
        # A page without any dated message simply does not trigger this check.
        firstDateLink = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
        if firstDateLink is not None and firstDateLink['href'].split('/')[-1] == '1':
            break

        pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
        if not pageLink:
            # Some pages are missing a "tme_messages_more" tag, causing early termination;
            # derive the next page from the numeric parameter of the current page URL instead.
            if '=' not in nextPageUrl:
                canonicalLink = soup.find('link', attrs = {'rel': 'canonical'}, href = True)
                if canonicalLink is None:
                    break
                nextPageUrl = canonicalLink['href']
            try:
                nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
            except ValueError:
                # The URL carries no numeric parameter (e.g. the plain channel
                # URL of a short channel), so no older page can be computed.
                break
            if nextPostIndex > 20:
                pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
            else:
                break

        nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
        r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
|
||||
def _get_entity(self):
    """Collect the channel's metadata and return it as a ``Channel``.

    The member count and profile picture come from ``https://t.me/<name>``;
    title, verification flag, username, description and the media counters
    come from the post list page. Returns ``None`` if the channel has no
    public post list.

    Raises:
        snscrape.base.ScraperException: if a page request does not return
            status 200.
    """
    kwargs = {}

    # /channel has a more accurate member count and bigger profile picture
    r = self._get(f'https://t.me/{self._name}', headers=self._headers)
    if r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    soup = bs4.BeautifulSoup(r.text, 'lxml')

    membersDiv = soup.find('div', class_='tgme_page_extra')
    if membersDiv:
        # Only the part before the first comma is the member count.
        membersPart = membersDiv.text.split(',')[0]
        if membersPart.endswith((' members', ' subscribers')):
            membersStr = ''.join(membersPart.split(' ')[:-1])
            kwargs['members'] = 0 if membersStr == 'no' else int(membersStr)

    photoImg = soup.find('img', class_='tgme_page_photo_image')
    kwargs['photo'] = None if photoImg is None else photoImg.attrs['src']

    r, soup = self._initial_page()
    if '/s/' not in r.url:  # Redirect on channels without public posts
        return

    channelInfoDiv = soup.find('div', class_='tgme_channel_info')
    assert channelInfoDiv, 'channel info div not found'
    titleDiv = channelInfoDiv.find('div', class_='tgme_channel_info_header_title')
    kwargs['title'] = titleDiv.find('span').text
    kwargs['verified'] = bool(titleDiv.find('i', class_='verified-icon'))

    # The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere.
    # However, the post URLs are, so extract the first post and use that.
    usernames = self._soup_to_items(soup, r.url, onlyUsername=True)
    try:
        kwargs['username'] = next(usernames)
    except StopIteration:
        # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
        _logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
        usernameDiv = channelInfoDiv.find('div', class_='tgme_channel_info_header_username')
        kwargs['username'] = usernameDiv.text[1:]  # Remove @

    descriptionDiv = channelInfoDiv.find('div', class_='tgme_channel_info_description')
    if descriptionDiv:
        kwargs['description'] = descriptionDiv.text

    for counterDiv in channelInfoDiv.find_all('div', class_='tgme_channel_info_counter'):
        value, granularity = _parse_num(counterDiv.find('span', class_='counter_value').text)
        counterType = counterDiv.find('span', class_='counter_type').text
        # 'members' was already extracted more accurately from /channel, so it is not taken from here.
        if counterType in ('photos', 'videos', 'links', 'files'):
            kwargs[counterType] = snscrape.base.IntWithGranularity(value, granularity)

    return Channel(**kwargs)
|
||||
|
||||
@classmethod
def _cli_setup_parser(cls, subparser):
    """Add the positional, non-empty ``channel`` argument to ``subparser``."""
    channelType = snscrape.base.nonempty_string('channel')
    subparser.add_argument('channel', type=channelType, help='A channel name')
|
||||
|
||||
@classmethod
def _cli_from_args(cls, args):
    """Construct the scraper from parsed arguments, using ``args.channel`` as the channel name."""
    channel = args.channel
    return cls._cli_construct(args, channel)
|
||||
|
||||
def _parse_num(s):
    """Parse a displayed counter such as ``'1 234'``, ``'5.6K'`` or ``'1.2M'``.

    Args:
        s: The counter text. Spaces are ignored; a trailing ``K`` or ``M``
            scales the number by a thousand or a million.

    Returns:
        A ``(value, granularity)`` tuple of ints, where the granularity is the
        value of the last displayed digit, e.g. ``(5600, 100)`` for ``'5.6K'``.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    s = s.replace(' ', '')
    exponent = {'K': 3, 'M': 6}.get(s[-1:])
    if exponent is None:
        return int(s), 1
    whole, _, fraction = s[:-1].partition('.')
    # Scale with integer arithmetic. Going through float truncates some
    # inputs: int(float('2.01') * 1000) is 2009, not 2010.
    value = int(whole or '0') * 10 ** exponent
    if fraction:
        # Digits beyond the suffix's precision are cut off, as before.
        value += int(fraction[:exponent].ljust(exponent, '0'))
    # Never report a granularity below one.
    return value, 10 ** max(exponent - len(fraction), 0)
|
||||
|
||||
def _durationStrToSeconds(durationStr):
    """Convert a colon-separated duration (``S``, ``M:S`` or ``H:M:S``) to seconds.

    Only the last three components are used; anything further left is ignored.
    """
    seconds = 0
    components = reversed(durationStr.split(':'))
    for unit, component in zip((1, 60, 3600), components):
        seconds += int(unit) * int(component)
    return seconds
|
||||
|
||||
def _telegramResponseOkCallback(r):
    """Accept a response only if its status code is 200.

    Returns ``(True, None)`` for status 200, otherwise ``(False, message)``
    where the message names the status code received.
    """
    if r.status_code != 200:
        return (False, f'{r.status_code=}')
    return (True, None)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,107 +1,391 @@
|
||||
__all__ = ['VKontaktePost', 'Photo', 'PhotoVariant', 'Video', 'User', 'VKontakteUserScraper']
|
||||
|
||||
|
||||
import bs4
|
||||
import collections
|
||||
import dataclasses
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
try:
    import zoneinfo
except ImportError:
    # Python 3.8 support; nowadays, Europe/Moscow is always UTC+3, but it's more complicated before 2014, so need proper zone info
    import pytz

    def _timezone(s):
        """Return the pytz timezone named ``s``."""
        return pytz.timezone(s)

    def _localised_datetime(tz, *args, **kwargs):
        """Build a datetime from the arguments and localise it to ``tz`` via pytz."""
        naive = datetime.datetime(*args, **kwargs)
        return tz.localize(naive)
else:
    def _timezone(s):
        """Return the ``zoneinfo.ZoneInfo`` timezone named ``s``."""
        return zoneinfo.ZoneInfo(s)

    def _localised_datetime(tz, *args, **kwargs):
        """Build a datetime from the arguments with ``tz`` as its ``tzinfo``."""
        return datetime.datetime(*args, tzinfo=tz, **kwargs)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_logger = logging.getLogger(__name__)
|
||||
# English month abbreviations; the index plus one is the month number.
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Matches 'today|yesterday|<day> <month> [<year>]|<month> <day>, <year>' followed by ' at <hour>:<minute> am|pm'.
_datePattern = re.compile(''.join((
    r'^(?P<date>today',
    r'|yesterday',
    r'|(?P<day1>\d+)\s+(?P<month1>', '|'.join(_months), r')(\s+(?P<year1>\d{4}))?',
    r'|(?P<month2>', '|'.join(_months), r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})',
    r')',
    r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$',
)))
|
||||
|
||||
|
||||
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
    """A post on a VK wall.

    This is the dataclass form of the item. The defaulted fields and the
    optional date cannot coexist with the earlier ``NamedTuple`` header or a
    second ``date`` field, so only one definition of each is kept.
    """

    url: str
    # A datetime when a timestamp or a time of day is available, a plain date
    # when only the day is known, None when no date could be extracted.
    date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
    # None when the post has no text element.
    content: typing.Optional[str]
    outlinks: typing.Optional[typing.List[str]] = None
    photos: typing.Optional[typing.List['Photo']] = None
    video: typing.Optional['Video'] = None
    quotedPost: typing.Optional['VKontaktePost'] = None

    def __str__(self):
        """Return the post URL."""
        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Photo:
    """A photo of a post, available in one or more size variants."""

    variants: typing.List['PhotoVariant']
    url: typing.Optional[str] = None  # URL of the photo's page, if it could be determined
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PhotoVariant:
    """One size variant of a photo: its image URL and pixel dimensions."""

    url: str
    width: int
    height: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Video:
    """A video of a post, described by the data attributes of its thumbnail link."""

    id: str  # value of the link's data-video attribute
    list: str  # value of the link's data-list attribute
    duration: int  # value of the link's data-duration attribute
    url: str
    thumbUrl: str
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Item):
    """Profile information of a VK user or public page.

    ``username``, ``name`` and ``verified`` are required; the remaining fields
    default to ``None``. Converting an instance to ``str`` gives the profile
    URL.
    """

    username: str
    name: str
    verified: bool
    description: typing.Optional[str] = None
    websites: typing.Optional[typing.List[str]] = None
    # Counters, each carrying the granularity of the displayed number.
    followers: typing.Optional[snscrape.base.IntWithGranularity] = None
    posts: typing.Optional[snscrape.base.IntWithGranularity] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    tags: typing.Optional[snscrape.base.IntWithGranularity] = None
    following: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated aliases: the granularity now lives on the counter itself.
    followersGranularity = snscrape.base._DeprecatedProperty(
        'followersGranularity',
        lambda self: self.followers.granularity,
        'followers.granularity',
    )
    postsGranularity = snscrape.base._DeprecatedProperty(
        'postsGranularity',
        lambda self: self.posts.granularity,
        'posts.granularity',
    )
    photosGranularity = snscrape.base._DeprecatedProperty(
        'photosGranularity',
        lambda self: self.photos.granularity,
        'photos.granularity',
    )
    tagsGranularity = snscrape.base._DeprecatedProperty(
        'tagsGranularity',
        lambda self: self.tags.granularity,
        'tags.granularity',
    )
    followingGranularity = snscrape.base._DeprecatedProperty(
        'followingGranularity',
        lambda self: self.following.granularity,
        'following.granularity',
    )

    def __str__(self):
        """Return the ``https://vk.com/<username>`` URL of the profile."""
        return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
name = 'vkontakte-user'
|
||||
|
||||
def __init__(self, username, **kwargs):
    """Set up a scraper for the wall of ``username``.

    Any further keyword arguments are handed to the base class constructor.
    """
    super().__init__(**kwargs)
    self._username = username
    self._baseUrl = f'https://vk.com/{self._username}'
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Cache for the first page: the response and its parsed soup.
    self._initialPage = None
    self._initialPageSoup = None
|
||||
|
||||
def _soup_to_items(self, soup, baseUrl):
|
||||
for post in soup.find_all('div', class_ = 'post'):
|
||||
def _away_a_to_url(self, a):
    """Extract the target of a ``/away.php?to=...`` link.

    Args:
        a: An ``<a>`` tag (or None).

    Returns:
        The URL-decoded value of the ``to`` parameter, or None if ``a`` is
        missing or its href does not have that form.
    """
    prefix = '/away.php?to='
    if not a or not a.get('href', '').startswith(prefix):
        return None
    # The target ends at the next query parameter, if there is one.
    target, _, _ = a['href'][len(prefix):].partition('&')
    return urllib.parse.unquote(target)
|
||||
|
||||
def is_photo(self, a):
    """Return whether the tag's ``aria-label`` attribute starts with ``'photo'``."""
    label = a.attrs.get('aria-label', '')
    return label.startswith('photo')
|
||||
|
||||
def _date_span_to_date(self, dateSpan):
    """Turn the date element of a post into a date or datetime.

    Returns:
        * a UTC datetime if the element has a ``time`` attribute (a Unix timestamp),
        * a Europe/Moscow datetime with minute precision if the text matches
          the relative/absolute "... at H:MM am/pm" format,
        * a plain ``datetime.date`` if the text is only "<day> <month> <year>",
        * None if there is no element or the text cannot be parsed (a warning
          is logged unless the text is 'video' or 'photo').
    """
    if not dateSpan:
        return None
    if 'time' in dateSpan.attrs:
        return datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc)

    match = _datePattern.match(dateSpan.text)
    if match:
        # Datetime information down to minutes
        parts = match.groupdict()
        tz = _timezone('Europe/Moscow')
        if parts['date'] in ('today', 'yesterday'):
            day_ = datetime.datetime.now(tz=tz)
            if parts['date'] == 'yesterday':
                day_ -= datetime.timedelta(days=1)
            year, month, day = day_.year, day_.month, day_.day
        else:
            # A missing year means the current one.
            year = int(parts['year1'] or parts['year2'] or datetime.datetime.now(tz=tz).year)
            month = _months.index(parts['month1'] or parts['month2']) + 1
            day = int(parts['day1'] or parts['day2'])
        # 12-hour clock: 12 am is hour 0, 12 pm is hour 12.
        hour = int(parts['hour'])
        if hour == 12:
            hour -= 12
        if parts['ampm'] == 'pm':
            hour += 12
        minute = int(parts['minute'])
        return _localised_datetime(tz, year, month, day, hour, minute)

    match = re.match(r'^(?P<day>\d+)\s+(?P<month>' + '|'.join(_months) + r')\s+(?P<year>\d{4})$', dateSpan.text)
    if match:
        # Date only
        return datetime.date(int(match.group('year')), _months.index(match.group('month')) + 1, int(match.group('day')))

    # Silently ignore video and photo reposts which have no original date attached
    if dateSpan.text not in ('video', 'photo'):
        _logger.warning(f'Could not parse date string: {dateSpan.text!r}')
|
||||
|
||||
def _post_div_to_item(self, post, isCopy = False):
    """Convert the element of a wall post into a ``VKontaktePost``.

    Args:
        post: The post element, or the quote element of a repost when
            ``isCopy`` is true.
        isCopy: Whether ``post`` is a quoted (reposted) post; this changes
            where the link, the date and the thumbnails are looked up.

    Returns:
        The ``VKontaktePost``, or None (after logging a warning) if the
        element has no post link.

    The method must return its item rather than yield it: a ``yield`` in the
    body would make this a generator function and the returned post would
    never reach the caller.
    """
    postLink = post.find('a', class_ = 'post_link' if not isCopy else 'published_by_date')
    if not postLink:
        _logger.warning(f'Skipping post without link: {str(post)[:200]!r}')
        return
    url = urllib.parse.urljoin(self._baseUrl, postLink['href'])
    # Expect a wall URL (or a video/photo URL for quotes) ending in '_<digits>', optionally followed by '?reply='.
    assert (url.startswith('https://vk.com/wall') or (isCopy and (url.startswith('https://vk.com/video') or url.startswith('https://vk.com/photo')))) and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') in ('', '?reply=')
    if not isCopy:
        dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
    else:
        dateSpan = post.find('div', class_ = 'copy_post_date').find('a', class_ = 'published_by_date')
    textDiv = post.find('div', class_ = 'wall_post_text')

    # Outlinks: away-links inside the text plus the media link title, without duplicates.
    outlinks = [h for a in textDiv.find_all('a') if (h := self._away_a_to_url(a))] if textDiv else []
    if (mediaLinkDiv := post.find('div', class_ = 'media_link')) and \
       (mediaLinkA := mediaLinkDiv.find('a', class_ = 'media_link__title')) and \
       (href := self._away_a_to_url(mediaLinkA)) and \
       href not in outlinks:
        outlinks.append(href)

    photos = None
    video = None
    if (thumbsDiv := (post.find('div', class_ = 'wall_text') if not isCopy else post).find('div', class_ = 'page_post_sized_thumbs')) and \
       not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
        photos = []
        for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
            if not self.is_photo(a) and 'data-video' not in a.attrs:
                _logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
                continue
            if 'data-video' in a.attrs:
                # Video
                video = Video(
                    id = a['data-video'],
                    list = a['data-list'],
                    duration = int(a['data-duration']),
                    url = f'https://vk.com{a["href"]}',
                    # 22 == len('background-image: url('); the URL runs up to the closing parenthesis.
                    thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
                )
                continue
            # From here on: photo
            if 'onclick' not in a.attrs or not a['onclick'].startswith("return showPhoto('") or '{"temp":' not in a['onclick'] or not a['onclick'].endswith('}, event)'):
                _logger.warning(f'Photo thumb wrap on {url} has no or unexpected onclick, skipping')
                continue
            photoData = a['onclick'][a['onclick'].find('{"temp":') : -8] # -8 = len(', event)')
            photoObj = json.loads(photoData)
            singleLetterKeys = [k for k in photoObj['temp'].keys() if len(k) == 1 and 97 <= ord(k) <= 122] # 97 = ord('a'), 122 = ord('z')
            for x in singleLetterKeys:
                # Merge base into URLs
                if not photoObj['temp'][x].startswith('https://'):
                    photoObj['temp'][x] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x]}'
                x_ = f'{x}_'
                if not photoObj['temp'][x_][0].startswith('https://'):
                    photoObj['temp'][x_][0] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x_][0]}'
            # Validate the structure before trusting it: known keys only, consistent URLs, [url, width, height] triples.
            if any(k not in {'base', 'w', 'w_', 'x', 'x_', 'y', 'y_', 'z', 'z_'} for k in photoObj['temp'].keys()) or \
               not all(photoObj['temp'][x] in (photoObj['temp'][f'{x}_'][0], photoObj['temp'][f'{x}_'][0] + '.jpg') for x in singleLetterKeys) or \
               not all(photoObj['temp'][x].startswith('https://sun') and '.userapi.com/' in photoObj['temp'][x] for x in singleLetterKeys) or \
               not all(len(photoObj['temp'][(x_ := f'{x}_')]) == 3 and isinstance(photoObj['temp'][x_][1], int) and isinstance(photoObj['temp'][x_][2], int) for x in singleLetterKeys):
                _logger.warning(f'Photo thumb wrap on {url} has unexpected data structure, skipping')
                continue
            photoVariants = []
            for x in singleLetterKeys:
                x_ = f'{x}_'
                photoVariants.append(PhotoVariant(url = f'{photoObj["temp"][x_][0]}.jpg' if '.jpg' not in photoObj['temp'][x_][0] else photoObj['temp'][x_][0], width = photoObj['temp'][x_][1], height = photoObj['temp'][x_][2]))
            photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
            photos.append(Photo(variants = photoVariants, url = photoUrl))

    # A quoted post is converted recursively with the repost lookup rules.
    quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
    return VKontaktePost(
        url = url,
        date = self._date_span_to_date(dateSpan),
        content = textDiv.text if textDiv else None,
        outlinks = outlinks or None,
        photos = photos or None,
        video = video or None,
        quotedPost = quotedPost,
    )
|
||||
|
||||
def _soup_to_items(self, soup):
    """Yield one item per ``<div class="post">`` element found in ``soup``.

    Elements for which ``_post_div_to_item`` returns None (it does so for
    posts without a link) are skipped, so consumers never receive None.
    """
    for post in soup.find_all('div', class_ = 'post'):
        item = self._post_div_to_item(post)
        if item is not None:
            yield item
|
||||
|
||||
def _initial_page(self):
    """Return the ``(response, soup)`` pair of the profile page.

    The page is requested on first use and cached on the instance. A 404
    response is cached like a 200 one; callers check the status themselves.

    Raises:
        snscrape.base.ScraperException: if the status is neither 200 nor 404.
    """
    if self._initialPage is not None:
        return self._initialPage, self._initialPageSoup

    _logger.info('Retrieving initial data')
    response = self._get(self._baseUrl, headers=self._headers)
    if response.status_code not in (200, 404):
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
    parsed = bs4.BeautifulSoup(response.content, 'lxml', from_encoding=response.encoding)
    self._initialPage = response
    self._initialPageSoup = parsed
    return self._initialPage, self._initialPageSoup
|
||||
|
||||
def get_items(self):
    """Yield the posts of the wall, page by page.

    Nothing is yielded (after logging) if the wall does not exist, the
    profile is private or shows a "deleted" text, or the wall has no posts.
    The initial page comes from ``_initial_page`` and every further page from
    ``_get_wall_offset``, so each page is requested exactly once. Posts whose
    ID is among the last 1000 yielded ones are not yielded again.

    Raises:
        snscrape.base.ScraperException: on an unexpected status code or an
            unrecognised pagination response.
    """
    r, soup = self._initial_page()
    if r.status_code == 404:
        _logger.warning('Wall does not exist')
        return

    if soup.find('div', class_ = 'profile_closed_wall_dummy'):
        _logger.warning('Private profile')
        return

    if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
        # Unclear what this state represents, so just log website text.
        _logger.warning(profileDeleted.text)
        return

    newestPost = soup.find('div', class_ = 'post')
    if not newestPost:
        _logger.info('Wall has no posts')
        return
    ownerID = newestPost.attrs['data-post-id'].split('_')[0]
    # If there is a pinned post, we need its ID for the pagination requests
    if 'post_fixed' in newestPost.attrs['class']:
        fixedPostID = int(newestPost.attrs['id'].split('_')[1])
    else:
        fixedPostID = ''

    last1000PostIDs = collections.deque(maxlen = 1000)

    def _process_soup(soup):
        # Yield the items of one page, dropping posts that were seen recently.
        for item in self._soup_to_items(soup):
            if item is None:
                # Posts without a link cannot be converted into an item.
                continue
            postID = int(item.url.rsplit('_', 1)[1])
            if postID not in last1000PostIDs:
                yield item
                last1000PostIDs.append(postID)

    yield from _process_soup(soup)

    lastWorkingOffset = 0
    for offset in itertools.count(start = 10, step = 10):
        posts = self._get_wall_offset(fixedPostID, ownerID, offset)
        if posts.startswith('<div class="page_block no_posts">'):
            # Reached the end
            break
        if not posts.startswith('<div id="post'):
            if posts == '"\\/blank.php?block=119910902"':
                _logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
                # Probe every single offset between the last good page and the end of the blocked one.
                for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
                    geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
                    if geoPosts.startswith('<div class="page_block no_posts">'):
                        # No breaking the outer loop, it'll just make one extra request and exit as well
                        break
                    if not geoPosts.startswith('<div id="post'):
                        if geoPosts == '"\\/blank.php?block=119910902"':
                            continue
                        raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
                    yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
                continue
            raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
        lastWorkingOffset = offset
        soup = bs4.BeautifulSoup(posts, 'lxml')
        yield from _process_soup(soup)
|
||||
|
||||
def _get_wall_offset(self, fixedPostID, ownerID, offset):
    """Request one pagination chunk of the wall and return its HTML payload.

    Args:
        fixedPostID: ID of the pinned post, or ``''`` if there is none.
        ownerID: ID of the wall's owner.
        offset: Pagination offset, sent both as ``offset`` and ``wall_start_from``.

    Returns:
        The string found at ``payload[1][0]`` of the JSON response.

    Raises:
        snscrape.base.ScraperException: if the response status is not 200.
    """
    requestHeaders = {**self._headers, 'X-Requested-With': 'XMLHttpRequest'}
    formData = [
        ('act', 'get_wall'),
        ('al', 1),
        ('fixed', fixedPostID),
        ('offset', offset),
        ('onlyCache', 'false'),
        ('owner_id', ownerID),
        ('type', 'own'),
        ('wall_start_from', offset),
    ]
    _logger.info(f'Retrieving page offset {offset}')
    response = self._post('https://vk.com/al_wall.php', data=formData, headers=requestHeaders)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
    return response.json()['payload'][1][0]
|
||||
|
||||
def _get_entity(self):
    """Collect the profile information of the wall's owner as a ``User``.

    Returns None if the profile page did not return status 200. Name,
    verification flag, description, websites and the various counters are
    read from the initial page.
    """
    r, soup = self._initial_page()
    if r.status_code != 200:
        return
    kwargs = {}
    kwargs['username'] = r.url.rsplit('/', 1)[1]
    nameH1 = soup.find('h1', class_ = 'page_name')
    kwargs['name'] = nameH1.text
    kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))

    if (descriptionDiv := soup.find('div', id = 'page_current_info')):
        kwargs['description'] = descriptionDiv.text

    if (infoDiv := soup.find('div', id = 'page_info_wrap')):
        websites = []
        for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
            if 'profile_info_row' in rowDiv['class']:
                labelDiv = rowDiv.find('div', class_ = 'fl_l')
                if not labelDiv or labelDiv.text != 'Website:':
                    continue
            else: # group_info_row
                if rowDiv['title'] == 'Description':
                    kwargs['description'] = rowDiv.text
                if rowDiv['title'] != 'Website':
                    continue
            for a in rowDiv.find_all('a'):
                if not a['href'].startswith('/away.php?to='):
                    _logger.warning(f'Skipping odd website link: {a["href"]!r}')
                    continue
                websites.append(urllib.parse.unquote(a['href'].split('=', 1)[1].split('&', 1)[0]))
        if websites:
            kwargs['websites'] = websites

    def parse_num(s: str) -> typing.Tuple[int, int]:
        # Return (value, granularity) for a displayed counter such as '1,234', '12K' or '1.2M'.
        for suffix, multiplier in (('K', 1000), ('M', 1000000)):
            if s.endswith(suffix):
                baseNum = s[:-1]
                decimals = len(baseNum.split('.')[1]) if '.' in baseNum else 0
                # round() instead of int(): the float product can fall just
                # below the intended integer (2.01 * 1000000 truncates to 2009999).
                return round(float(baseNum) * multiplier), max(multiplier // (10 ** decimals), 1)
        return int(s.replace(',', '')), 1

    if (countsDiv := soup.find('div', class_ = 'counts_module')):
        for a in countsDiv.find_all('a', class_ = 'page_counter'):
            count, granularity = parse_num(a.find('div', class_ = 'count').text)
            label = a.find('div', class_ = 'label').text
            # Singular labels are mapped onto the plural field names.
            if label in ('follower', 'post', 'photo', 'tag'):
                label = f'{label}s'
            if label in ('followers', 'posts', 'photos', 'tags'):
                kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)

    if (idolsDiv := soup.find('div', id = 'profile_idols')):
        if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
            kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))

    # On public pages, this is where followers are listed
    if (followersDiv := soup.find('div', id = 'public_followers')):
        if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
            kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))

    return User(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', help = 'A VK username')
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A VK username')
|
||||
|
||||
@classmethod
|
||||
def from_args(cls, args):
|
||||
return cls(args.username, retries = args.retries)
|
||||
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
153
snscrape/modules/weibo.py
Normal file
153
snscrape/modules/weibo.py
Normal file
@@ -0,0 +1,153 @@
|
||||
__all__ = ['Post', 'User', 'WeiboUserScraper']
|
||||
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_userDoesNotExist = object()
|
||||
_HTML_STRIP_PATTERN = re.compile(r'<[^>]*>')
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Post(snscrape.base.Item):
    '''A single Weibo status as produced by the scraper's `_mblog_to_item`.'''

    # Identification
    url: str
    id: str
    user: typing.Optional['User']

    # Content
    createdAt: str # Can have a variety of inconsistent formats
    text: str

    # Engagement counters; None when the API response omits the field
    repostsCount: typing.Optional[int]
    commentsCount: typing.Optional[typing.Union[int, str]]
    likesCount: typing.Optional[int]

    # Attached media
    picturesCount: typing.Optional[int]
    pictures: typing.Optional[typing.List[str]] # May be shorter than picturesCount if the API didn't return all of them (e.g. post Ipay2evb0)
    video: typing.Optional[str]
    link: typing.Optional[str]

    # The status this one reposts, if any
    repostedPost: typing.Optional['Post']

    def __str__(self):
        '''Return the post URL as the string form of the item.'''

        return self.url
|
||||
|
||||
|
||||
@dataclasses.dataclass
class User(snscrape.base.Item):
    '''A Weibo account as produced by the scraper's `_user_info_to_entity`.'''

    # Identity
    screenname: str
    uid: int

    # Verification status and, when provided, the stated reason
    verified: bool
    verifiedReason: typing.Optional[str]

    # Profile details
    description: str
    statusesCount: int
    followersCount: int
    followCount: int
    avatar: str

    def __str__(self):
        '''Return the mobile-site profile URL built from the numeric uid.'''

        profileUrl = f'https://m.weibo.cn/u/{self.uid}'
        return profileUrl
|
||||
|
||||
|
||||
class WeiboUserScraper(snscrape.base.Scraper):
    '''Scraper for the statuses and profile of a single Weibo user via m.weibo.cn.

    The user may be given as a numeric user ID (an `int`) or as a username (any other value); usernames are resolved to a user ID on first use.
    '''

    name = 'weibo-user'

    def __init__(self, user, **kwargs):
        '''Store the target user; remaining keyword arguments are passed to the base scraper.'''

        super().__init__(**kwargs)
        self._user = user
        self._isUserId = isinstance(user, int)
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}

    def _ensure_user_id(self):
        '''Resolve a username to a numeric user ID, once.

        Afterwards `self._user` is either the integer user ID or the `_userDoesNotExist` sentinel. Raises `snscrape.base.ScraperException` on any response that is neither the expected redirect nor the 'user does not exist' page.
        '''

        # Already resolved, either successfully or to the sentinel. Without the sentinel check,
        # a second call would request a URL built from the sentinel object's repr.
        if self._isUserId or self._user is _userDoesNotExist:
            return
        r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
        # A missing Location header is treated like a non-matching one instead of raising KeyError.
        location = r.headers.get('Location', '')
        if r.status_code == 302 and re.fullmatch(r'/u/[0-9]{10}', location):
            # Redirect to uid URL
            self._user = int(location[3:])
            self._isUserId = True
        elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
            _logger.warning('User does not exist')
            self._user = _userDoesNotExist
        else:
            raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')

    def _check_timeline_response(self, r):
        '''Response callback for timeline requests; returns an (ok, reason) tuple.'''

        if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
            # 'No content here yet'. Appears to happen sometimes on pagination, possibly due to too fast requests; retry this
            return False, 'no-content message'
        if r.status_code != 200:
            return False, 'non-200 status code'
        return True, None

    def _mblog_to_item(self, mblog):
        '''Convert one `mblog` dict from the API into a `Post`, recursing into a reposted status if present.'''

        # Read page_info once and tolerantly so that a partially populated entry cannot raise KeyError.
        pageInfo = mblog.get('page_info') or {}
        pageType = pageInfo.get('type')
        if pageType not in (None, 'video', 'webpage'):
            _logger.warning(f'Skipping unknown page info {pageType!r} on status {mblog["id"]}')

        video = None
        if pageType == 'video' and (urls := pageInfo.get('urls')):
            # Prefer the highest known quality and fall back to lower ones.
            video = urls.get('mp4_720p_mp4') or urls.get('mp4_hd_mp4') or urls.get('mp4_ld_mp4')
            if not video:
                _logger.warning(f'No known video quality on status {mblog["id"]}')

        return Post(
            url = f'https://m.weibo.cn/status/{mblog["bid"]}',
            id = mblog['id'],
            user = self._user_info_to_entity(mblog['user']) if mblog['user'] is not None else None,
            createdAt = mblog['created_at'],
            # Use the raw text when the API supplies it; otherwise strip tags from the HTML text.
            text = mblog['raw_text'] if 'raw_text' in mblog else _HTML_STRIP_PATTERN.sub('', mblog['text']),
            repostsCount = mblog.get('reposts_count'),
            commentsCount = mblog.get('comments_count'),
            likesCount = mblog.get('attitudes_count'),
            picturesCount = mblog.get('pic_num'),
            pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None,
            video = video,
            link = pageInfo['page_url'] if pageType == 'webpage' else None,
            repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None,
        )

    def get_items(self):
        '''Yield the user's statuses as `Post` objects, following `since_id` pagination until it ends.

        Yields nothing if the user does not exist. Raises `snscrape.base.ScraperException` on a non-200 timeline response.
        '''

        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        sinceId = None
        while True:
            sinceParam = f'&since_id={sinceId}' if sinceId is not None else ''
            r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25{sinceParam}', headers = self._headers, responseOkCallback = self._check_timeline_response)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            o = r.json()
            for card in o['data']['cards']:
                # Only cards of type 9 are converted; all others are logged and skipped.
                if card['card_type'] != 9:
                    _logger.warning(f'Skipping card of type {card["card_type"]}')
                    continue
                yield self._mblog_to_item(card['mblog'])
            if 'since_id' not in o['data']['cardlistInfo']:
                # End of pagination
                break
            sinceId = o['data']['cardlistInfo']['since_id']

    def _user_info_to_entity(self, userInfo):
        '''Convert a user info dict from the API into a `User`.'''

        return User(
            screenname = userInfo['screen_name'],
            uid = userInfo['id'],
            verified = userInfo['verified'],
            verifiedReason = userInfo.get('verified_reason'),
            description = userInfo['description'],
            statusesCount = userInfo['statuses_count'],
            followersCount = userInfo['followers_count'],
            followCount = userInfo['follow_count'],
            avatar = userInfo['avatar_hd'],
        )

    def _get_entity(self):
        '''Fetch the user's profile and return it as a `User`, or `None` if the user does not exist.

        Raises `snscrape.base.ScraperException` if the profile request does not return HTTP 200.
        '''

        self._ensure_user_id()
        if self._user is _userDoesNotExist:
            return
        r = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
        if r.status_code != 200:
            raise snscrape.base.ScraperException('Could not fetch user info')
        o = r.json()
        return self._user_info_to_entity(o['data']['userInfo'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        '''Register the command-line arguments of this scraper on `subparser`.'''

        subparser.add_argument('--name', dest = 'isName', action = 'store_true', help = 'Use username instead of user ID')
        subparser.add_argument('user', type = snscrape.base.nonempty_string('user'), help = 'A user ID')

    @classmethod
    def _cli_from_args(cls, args):
        '''Construct the scraper from parsed arguments; without `--name` the user is converted to an int.'''

        return cls._cli_construct(args, user = args.user if args.isName else int(args.user))
|
||||
16
snscrape/utils.py
Normal file
16
snscrape/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
def dict_map(input, keyMap):
    '''Build a new dict by renaming keys of `input` according to `keyMap`.

    `keyMap` is a {'input_key': 'output_key'} mapping. Input keys that are not present in `input` are skipped, and keys of `input` that `keyMap` does not mention are left out of the result.
    '''

    renamed = {}
    for sourceKey, targetKey in keyMap.items():
        if sourceKey not in input:
            continue
        renamed[targetKey] = input[sourceKey]
    return renamed
|
||||
|
||||
|
||||
def snake_to_camel(**kwargs):
    '''Return a new dict from kwargs with snake_case keys replaced by camelCase.

    The part before the first underscore is kept as is; the first character of every following part is uppercased and the underscores are dropped. Values are passed through unchanged.
    '''

    out = {}
    for key, value in kwargs.items():
        head, *tail = key.split('_')
        # Slicing with [:1] keeps empty parts (from consecutive underscores) from raising IndexError.
        out[head + ''.join(part[:1].upper() + part[1:] for part in tail)] = value
    return out
|
||||
@@ -1,7 +1,7 @@
|
||||
import pkg_resources
|
||||
import importlib.metadata
|
||||
|
||||
|
||||
try:
|
||||
__version__ = pkg_resources.get_distribution('snscrape').version
|
||||
except pkg_resources.DistributionNotFound:
|
||||
__version__ = importlib.metadata.version('snscrape')
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
__version__ = None
|
||||
|
||||
Reference in New Issue
Block a user