336 Commits

Author SHA1 Message Date
Tristan Lee
40b8d9f267 Merge pull request #7 from bellingcat/more-tg-info
More tg info
2022-07-05 08:29:09 -07:00
Tristan Lee
fdc40f7411 Merge pull request #6 from bellingcat/add-vk-user
added User dataclass as argument to VKontaktePost dataclass
2022-07-05 08:28:01 -07:00
Tristan Lee
82351800d6 Merge pull request #5 from JustAnotherArchivist/master
merge upstream
2022-07-05 08:25:20 -07:00
Tristan Lee
73f10a4f24 fixed edge case where channel with no members fails _get_entity 2022-07-05 10:23:26 -05:00
Tristan Lee
cb429909d0 added User dataclass as argument to VKontaktePost dataclass 2022-07-05 10:21:59 -05:00
JustAnotherArchivist
d72b51953f Fix missing r prefix on string with regex backslashes 2022-06-24 23:12:50 +00:00
Tristan Lee
056cd6215c incorporated requested changes from maintainer, removed modifications to VK module 2022-06-23 15:47:18 -05:00
JustAnotherArchivist
d5b406bc1b Update API parameters to what Twitter currently uses
The `count` reduction does not affect anything as Twitter ignores that parameter now. Cf. #481
2022-06-23 19:50:17 +00:00
Tristan Lee
56e4232083 fixed typo 2022-06-23 11:51:13 -05:00
JustAnotherArchivist
50899c01f3 Fix crash on malformed guest token cache file
Fixes #494
2022-06-16 17:12:04 +00:00
JustAnotherArchivist
bcad6923c2 Rename Tweet.content to rawContent and User.description to renderedDescription for consistency
Closes #479
2022-06-14 00:35:02 +00:00
JustAnotherArchivist
0d361685ff Fix AttributeError crash on scrapers using the default CLI constructor
Introduced by 267b7d0e

Fixes #483
2022-06-01 17:35:38 +00:00
JustAnotherArchivist
530f4fa122 Fix KeyErrors on display_url and expanded_url for certain users with broken profile links
Fixes #480
2022-05-29 17:23:43 +00:00
JustAnotherArchivist
dc6bc9bf9d Refactor how links on Twitter are handled
All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears.

Closes #478
2022-05-29 07:16:04 +00:00
JustAnotherArchivist
01cf6a09b3 Fix type of description URL objects 2022-05-29 05:08:23 +00:00
JustAnotherArchivist
ef7c4fad3e Fix AttributeError for DescriptionURL on from-import 2022-05-29 05:08:23 +00:00
Tristan Lee
65723f10ff fixed merge 2022-05-25 06:47:47 -05:00
Tristan Lee
07a5f6fd7d merged master into more-tg-info to update upstream PR 2022-05-25 01:18:48 -05:00
Tristan Lee
0822a9c354 Merge pull request #4 from JustAnotherArchivist/master
upstream merge
2022-05-24 23:10:38 -07:00
JustAnotherArchivist
faeffe2603 Merge pull request #474 from GeraniumKF/GeraniumKF-reddit-since-crash
Fix crash using --since with Reddit
2022-05-23 23:06:16 +00:00
Geranium
e3bdc02a7c Reddit: deprecate 'created' property for 'date'
This fixes a crash when using --since with the Reddit scraper,
as the CLI code expects items to have a date property.
2022-05-23 23:31:44 +01:00
Tristan Lee
e2d922301e forgot to save modified twitter.py module 2022-05-09 09:37:36 -05:00
Tristan Lee
b13e62eb5d Merge branch 'JustAnotherArchivist-master' 2022-05-09 09:35:35 -05:00
Tristan Lee
f38513503d fixed merge conflicts 2022-05-09 09:35:19 -05:00
Tristan Lee
0a4bd39ca6 Merge pull request #2 from bellingcat/telegram-media
Implemented JustAnotherArchivist's requested changes to Telegram scraper from PR
2022-05-09 07:23:39 -07:00
Tristan Lee
c18ca0f047 Merge branch 'master' into telegram-media 2022-05-09 09:21:40 -05:00
Tristan Lee
5648e957d0 improved consistency of code formatting and added _STYLE_MEDIA_URL_PATTERN as variable 2022-04-27 16:41:24 -05:00
Tristan Lee
21f7b620ec moved forward finding out of tgme_widget_message_text clause, since it wasn't correctly getting the forwarding information in forwarded posts that contained attachments but no text 2022-04-21 18:26:31 -05:00
Tristan Lee
9b3faec980 added additional attributes for hashtags and user mentions, removed redundant outlinks 2022-04-21 18:06:43 -05:00
Tristan Lee
97d38e5cde added additional termination criteria to Telegram scraper 2022-04-21 09:41:53 -05:00
Tristan Lee
b276c3cc27 fixed issue where some videos and photos weren't being scraped (because they weren't in a post containing a 'tgme_widget_message_text' div) 2022-04-17 06:50:43 -05:00
Tristan Lee
1e4e0c278d fixed issue where Telegram scraper terminated early because some pages didn't have a next page link (added reasonable default) 2022-04-17 04:33:22 -05:00
Tristan Lee
babcddda19 made Telegram scraper not return full channel info for forwarded_from attribute; fixed video edge cases. 2022-04-17 03:55:37 -05:00
JustAnotherArchivist
ed3ea944d1 Fix newsletter issue cards without an issue description
Fixes #456
2022-04-16 19:44:36 +00:00
JustAnotherArchivist
e7a6d38a5f Add support for community_details cards 2022-04-15 20:07:01 +00:00
JustAnotherArchivist
6c50eee31b Fix proxies not being applied correctly due to missing merge with environment settings
Fixes #447
2022-04-15 19:23:54 +00:00
JustAnotherArchivist
5103a33afa Fix t.co card URL replacement on retweets
Fixes #411
2022-04-15 03:18:45 +00:00
JustAnotherArchivist
247bd82d79 Refactor to tweetId variable 2022-04-15 03:14:29 +00:00
JustAnotherArchivist
5fc67f2bcf Add support for 'message me' cards 2022-04-15 02:52:37 +00:00
JustAnotherArchivist
65e7d8bd24 Fix warning on card URL translation to include the tweet ID 2022-04-15 02:52:03 +00:00
JustAnotherArchivist
3870282a42 Fix broadcast and event card crashes 2022-04-12 20:53:38 +00:00
JustAnotherArchivist
7c0fcdec43 Fix Periscope card crashes 2022-04-12 18:29:51 +00:00
JustAnotherArchivist
9af1f19034 Properly support all card types
Fixes #407
2022-04-12 18:11:26 +00:00
JustAnotherArchivist
5fc3c0e290 Fix crash in locals dumping on module-less frames 2022-04-12 18:03:36 +00:00
Tristan Lee
f978954bb3 Merge branch 'JustAnotherArchivist:master' into master 2022-04-03 01:49:28 -05:00
Tristan Lee
2ce014ade4 fixed edge case for videos that have data-link-attr but no href attribute 2022-04-03 01:45:25 -05:00
JustAnotherArchivist
5d156c6a15 Detect and raise error on redirect from GraphQL endpoint to login
#165
2022-04-03 02:34:30 +00:00
Tristan Lee
4e59638e7c added a forwardedUrl attribute to TelegramPost and made forwarded attribute type Channel. 2022-03-30 21:33:03 -05:00
Tristan Lee
a7eb54d226 implemented Media dataclasses for Telegram, and added variable for extracting a post's view count 2022-03-30 21:07:17 -05:00
Tristan Lee
d32c9add8a added capability to scrape multiple videos from a single post 2022-03-30 18:13:15 -05:00
Tristan Lee
fb8d73ac95 handled case where channel has no profile image 2022-03-29 13:15:53 -05:00
Tristan Lee
ed829163a0 added capability to extract the number of channel members when the string in membersDiv has the word 'subscribers' rather than 'members'. 2022-03-29 01:12:07 -05:00
JustAnotherArchivist
694657ef80 Fix broken exception references 2022-03-09 01:01:47 +00:00
Logan Williams
b8efce2a12 Clean up unnecessary imports 2022-03-08 15:10:15 +01:00
JustAnotherArchivist
1ab0f4fccb Fix missing quoted tweet reference in certain buggy cases 2022-03-07 22:16:58 +00:00
JustAnotherArchivist
3a92b5bf0d Add log message for guest token file deletion 2022-02-26 19:32:55 +00:00
JustAnotherArchivist
2480b173f4 Fix crash on race condition in CLI guest token manager resets
Fixes #414
2022-02-26 19:31:08 +00:00
Logan Williams
de4ebed81f Fix KeyError caused by retweets without URLs in TwitterProfileScraper 2022-02-24 18:08:12 +01:00
Logan Williams
72b26f2373 Scrape images, video, and post forwarding information for Telegram channel posts 2022-02-24 15:31:02 +01:00
JustAnotherArchivist
77bbb9f61f Remove useless pass 2022-02-20 18:54:51 +00:00
JustAnotherArchivist
57a624c618 Merge pull request #410 from AccentuSoft/master
Fix Vkontakte-user module crash on users with millions of followers
2022-02-18 06:01:35 +00:00
AccentuSoft
b1cfd51121 Implementing changes 2022-02-17 21:52:15 +02:00
AccentuSoft
ace2c16f54 Fix Vkontakte-user module crash on users with millions of followers 2022-02-17 15:42:46 +02:00
JustAnotherArchivist
2f9c0457df Convert t.co card URLs to unshortened when possible 2022-02-17 01:50:15 +00:00
JustAnotherArchivist
878f2a3c7a Handle cards without descriptions and thumbnails
Fixes #407
2022-02-17 01:49:32 +00:00
JustAnotherArchivist
25ee014e29 Extract cards 2022-02-16 02:59:21 +00:00
JustAnotherArchivist
a192dc6236 Handle TweetWithVisibilityResults
Fixes #400
2022-02-14 18:08:59 +00:00
JustAnotherArchivist
a7242f340b Remove obsolete TODO
There is no retweetedTweetRef in Twitter's JS.
2022-02-14 18:08:29 +00:00
JustAnotherArchivist
359cc25cdf Fix crash on entity attribute when scraping suspended users
Fixes #396
2022-02-10 04:22:59 +00:00
JustAnotherArchivist
01799a7391 Detect when CLI guest token from file has expired 2022-02-08 19:38:45 +00:00
JustAnotherArchivist
b0753c34ed Fix forgotten method name changes in 7d939c11
Fixes #393
2022-02-08 15:35:49 +00:00
JustAnotherArchivist
7f78fa0bc0 Recurse through all tweets encountered, not only ones with a positive replyCount
Fixes #266
2022-02-07 18:13:56 +00:00
JustAnotherArchivist
8702a9c7e2 Add Reddit submission scraper
Closes #312
2022-02-07 04:43:54 +00:00
JustAnotherArchivist
8ac1fd3ea8 Refactor Pushshift code to separate the general things from the search 2022-02-07 04:43:19 +00:00
JustAnotherArchivist
9235890f9a Fix KeyError crash on attempting to scrape inexistent tweet ID 2022-02-07 04:04:21 +00:00
JustAnotherArchivist
7d939c110c Port profile and tweet scrapers to GraphQL API
Fixes #367
2022-02-07 03:49:14 +00:00
JustAnotherArchivist
8e95e9a9a7 Fix crash on places without a bounding box
Fixes #374
2022-02-07 00:38:22 +00:00
JustAnotherArchivist
aa7d7d3dc3 Refactor automatic importing in snscrape.modules to something less hacky
Cf. #357
2022-02-05 03:22:55 +00:00
JustAnotherArchivist
560c78c5cf Make all optional scraper arguments keyword-only and fix Mastodon argument style to conform with the other scrapers
Cf. #376
2022-01-30 00:21:18 +00:00
JustAnotherArchivist
107c3c71c2 Remove unnecessary f-strings
Cf. #370
2022-01-28 21:22:13 +00:00
JustAnotherArchivist
7f88678253 Merge pull request #359 from own3dh2so4/master
Added proxy option to Scraper base
2022-01-13 23:08:28 +00:00
David Garcia Alvarez
52e4f9fb69 Added proxy option to Scraper base 2022-01-13 16:56:00 +01:00
JustAnotherArchivist
eebdfc1c55 Refactor username vs ID mess
Closes #354
2022-01-12 22:36:26 +00:00
JustAnotherArchivist
e6076353c8 Fix user ID being a string instead of an int on the entity 2022-01-12 22:35:50 +00:00
JustAnotherArchivist
a32d79fab2 Fix crash on certain mblogs that lack the raw_text attribute 2022-01-12 22:31:49 +00:00
JustAnotherArchivist
65391297f6 Move CLI methods to end of class definition for consistent code style 2022-01-12 21:09:38 +00:00
JustAnotherArchivist
deb2659dd6 Prefix CLI-related methods with an underscore
Closes #355
2022-01-12 21:07:10 +00:00
JustAnotherArchivist
93e62744d7 Fix missing timezone info 2022-01-07 00:42:09 +00:00
JustAnotherArchivist
3f3632d341 Add support for Mastodon profile and toot scrapes
Closes #43
2022-01-06 03:25:06 +00:00
JustAnotherArchivist
5070953feb Skip private fields and properties on dataclass-to-JSON conversion 2022-01-06 02:08:48 +00:00
JustAnotherArchivist
853848ed5d ScrollDirection is not part of the public API 2022-01-05 19:43:19 +00:00
JustAnotherArchivist
0b4abdc43f Fix baseUrl on tweet scrapes 2022-01-05 02:39:54 +00:00
JustAnotherArchivist
267b7d0e32 Rename CLI classmethods 2022-01-05 02:27:09 +00:00
JustAnotherArchivist
acb7f10a4f Cache Twitter tokens on disk from the CLI for reuse between scrapes
Closes #339
2022-01-05 02:20:40 +00:00
JustAnotherArchivist
ca00b480b1 Fix AssertionError on quoted comments
Fixes #340
2022-01-04 01:15:08 +00:00
JustAnotherArchivist
f189ab4241 Prefix all private API names with an underscore
Cf. #328
2022-01-03 17:51:23 +00:00
JustAnotherArchivist
c6e1e33a23 Fix crashing typos 2022-01-03 17:49:55 +00:00
JustAnotherArchivist
a37ea528d3 Refactor Reddit scrapers again to merge RedditPushshiftScraper and RedditScraper
Cf. #328
2022-01-03 17:48:35 +00:00
JustAnotherArchivist
eee06d8593 Refactor Reddit scrapers into a more reasonable code structure
Cf. #328
2021-12-24 04:58:32 +00:00
JustAnotherArchivist
4dd3ee6e47 Refactor Instagram scrapers to get rid of the awkward mode parameter
Cf. #328
2021-12-24 04:50:53 +00:00
JustAnotherArchivist
0336ce13ed Add support for fetching a guest token from the API 2021-12-23 04:26:50 +00:00
JustAnotherArchivist
193d4f80d6 Fix user agent in API headers staying constant 2021-12-23 04:25:23 +00:00
JustAnotherArchivist
e7d35ec1eb Fix date parsing on quoted posts 2021-12-15 16:55:14 +00:00
JustAnotherArchivist
8540045658 Fix typo 2021-12-15 16:36:28 +00:00
JustAnotherArchivist
1f1c1bd8af Fix docstring style 2021-12-14 20:05:51 +00:00
JustAnotherArchivist
7fdc8bcb53 Randomise user agent when the guest token can't be found 2021-12-14 20:04:46 +00:00
JustAnotherArchivist
4b3c6aefe7 Add default values to user and tweet scrapers for a more intuitive usage 2021-12-12 04:57:16 +00:00
JustAnotherArchivist
525cd71225 Retry guest token retrieval
Fixes #325 (hopefully)
2021-12-12 00:10:59 +00:00
JustAnotherArchivist
72abff9e5c Reuse guest tokens across scrapes
Cf. #326
2021-12-11 23:18:42 +00:00
JustAnotherArchivist
bcaa477b3d Update list of scrapers 2021-12-08 08:29:02 +00:00
JustAnotherArchivist
66d4c99f82 Remove dev version notice 2021-12-08 08:25:21 +00:00
JustAnotherArchivist
0ac50f1383 Add README to package metadata 2021-12-08 08:18:25 +00:00
JustAnotherArchivist
c2257ad16e Add Python 3.10 classifier 2021-12-08 08:15:05 +00:00
JustAnotherArchivist
58f654405f Add --citation
Closes #229
2021-12-08 07:51:28 +00:00
JustAnotherArchivist
35fb61a327 Fix crash on dumping scopes which have a variable pointing to a dataclass 2021-11-24 03:39:06 +00:00
JustAnotherArchivist
a6b6f3faaa Throw an error on empty arguments
Fixes #290
2021-10-10 17:43:27 +00:00
JustAnotherArchivist
5e829e2541 Refactor class instantiation to remove the need to repeat 'retries' everywhere 2021-09-30 09:58:10 +00:00
JustAnotherArchivist
d4567da23c Improve list of scrapers on --help output
Don't list all scrapers in the usage line, and provide a sorted readable list instead.
2021-09-30 09:35:17 +00:00
JustAnotherArchivist
e5e0da25a0 Remove unused imports 2021-09-30 09:24:18 +00:00
JustAnotherArchivist
821326bcfb Fix a few f-strings 2021-09-30 09:23:56 +00:00
JustAnotherArchivist
4bf9ef239c Restructure usage section 2021-09-30 09:18:43 +00:00
JustAnotherArchivist
e382891642 Fix Twitter trends not having a str representation 2021-09-21 21:40:50 +00:00
JustAnotherArchivist
e5f4389464 Add Twitter trend scraper
Due to restrictions on Twitter's side, it is not possible to get trends from a custom location as that would require using an account and/or their API.

Closes #206
2021-09-21 21:28:41 +00:00
JustAnotherArchivist
d91f971f51 Refactor user label implementation and add support for bot accounts
Closes #281
2021-09-21 19:39:40 +00:00
JustAnotherArchivist
67e8295293 Merge pull request #280 from edsu/master
User Labels
2021-09-19 03:35:49 +00:00
JustAnotherArchivist
5fc2562642 Add user label support on entity retrieval 2021-09-19 03:32:35 +00:00
JustAnotherArchivist
2825bd0a73 Remove accidental empty line 2021-09-19 03:31:56 +00:00
Ed Summers
9831f2a4a0 missing ext
While doing some long term data collection I found some user objects
that lack the key 'ext'. This would cause an exception unless it's
checked for before trying to dig out results.
2021-09-16 13:31:47 -04:00
Ed Summers
a11eef6b06 User label url
Each label also has a URL which is used for learning more about the
label. While there are more label descriptions than label URLs the URLs
do seem to group language variants of the same label. For example
https://help.twitter.com/rules-and-policies/state-affiliated-china is
used for all of the following label descriptions:

* Média affilié à un État, Chine
* China state-affiliated media
* 中国官方媒体
* Çin devletine bağlı medya
* China government official

In some analysis contexts it could be useful to group these together.
2021-09-16 13:04:57 -04:00
Ed Summers
3fb731ade1 User Labels
In August of 2020 Twitter started to label the accounts of government
officials and state-affiliated media entities:

https://blog.twitter.com/en_us/topics/product/2020/new-labels-for-government-and-state-affiliated-media-accounts

This information is extremely important for researchers who are studying
the impact of social media on political discourse, especially because it is not
currently available through either Twitter's v1.1 or v2 API endpoints.

The code in this small PR may seem a bit brittle but I've been using it
to collect data with each of the twitter subcommands and it seems to
work reliably. While there are image and page URLs associated with each
label I chose to only collect the text description of the label since it
should be sufficient for finding the additional information later if
needed.
2021-09-16 08:06:05 -04:00
JustAnotherArchivist
c76f1637ce Handle 403s from Twitter search
Closes #269
2021-08-30 23:29:20 +00:00
JustAnotherArchivist
ed117e8891 Log response status code and redirects 2021-08-29 18:26:00 +00:00
JustAnotherArchivist
f9a3fafb3f Fix --cursor on twitter-search 2021-08-01 20:59:16 +00:00
JustAnotherArchivist
660b8c7a0a Retry empty result sets from Twitter as a workaround for random early stops
#37
2021-07-18 23:59:52 +00:00
JustAnotherArchivist
0c22608dc7 Extract video view count
Also fix the broken ext values sent to Twitter

Closes #246
2021-07-01 17:58:45 +00:00
JustAnotherArchivist
2bb706feda Dump request and response attributes of RequestExceptions
Cf. #243
2021-06-30 21:44:02 +00:00
JustAnotherArchivist
5e6bc4ec50 Fix type of content field (may be None on text-less posts) 2021-05-27 00:33:12 +00:00
JustAnotherArchivist
57d0aaafc1 Remove dirtyUrl which does not appear to be used anymore by Instagram
#234
2021-05-27 00:32:03 +00:00
JustAnotherArchivist
157e4d4265 Fix default value of username field
#234
2021-05-27 00:29:33 +00:00
JustAnotherArchivist
54588e9c42 Add support for fetching top instead of live/chronological tweets
Closes #109
2021-05-23 03:24:30 +00:00
JustAnotherArchivist
9e7274f3d7 Clean up params dict construction 2021-05-23 03:24:11 +00:00
JustAnotherArchivist
ac4e335bdb Clean up duplicated default values 2021-05-23 03:03:32 +00:00
JustAnotherArchivist
1d255de48d Add hashtags and cashtags 2021-05-23 02:51:38 +00:00
JustAnotherArchivist
9c1dcd37f9 Add Tweet.{inReplyToTweetId,inReplyToUser}
This makes User.displayname optional because the replied-to user is not always present in the user mentions.
2021-05-23 02:44:40 +00:00
JustAnotherArchivist
f8dac183d0 Fix type of User.id 2021-05-23 02:43:53 +00:00
JustAnotherArchivist
45d1fa27de Add twitter-tweet scraper for retrieving tweets by ID, including scroll and recursion modes
Closes #51, closes #137
2021-05-23 02:12:13 +00:00
JustAnotherArchivist
98b798b0e5 Remove obsolete twitter-thread scraper
It was still based on the old, deprecated Twitter UI and broke a long time ago.

Closes #176
2021-05-22 22:37:21 +00:00
JustAnotherArchivist
f18b64e7da Add support for scraping Twitter users by ID
Closes #222
2021-05-22 21:17:14 +00:00
JustAnotherArchivist
460be9d581 Add _type attribute on all JSON objects, remove separate attribute on Twitter media 2021-05-22 18:14:54 +00:00
JustAnotherArchivist
97c8caea48 Set Accept-Language header on API requests to English 2021-04-20 01:50:14 +00:00
JustAnotherArchivist
a34f93076a Merge pull request #218 from NoeCampos22/Place_Data
Extract more information on Twitter places
2021-04-20 01:45:22 +00:00
NoeCampos22
8f1c470061 Tweet.place to Place dataclass 2021-04-19 15:13:33 -05:00
NoeCampos22
dbf2a2f689 Get more data from the place
Data like the country, place type and the single place name are now also returned on the JSON.
2021-04-19 12:01:14 -05:00
JustAnotherArchivist
39a34a57ac Handle API endpoints that don't include geolocation data (e.g. twitter-profile scraper)
Fixes #215
2021-04-13 20:15:42 +00:00
JustAnotherArchivist
f44b39705a Fix coordinate extraction from place bounding boxes 2021-04-06 20:53:05 +00:00
JustAnotherArchivist
f64ce217b7 Merge pull request #209 from Lukpier/master
Add tweet location (place full name & geo coordinates) where available
2021-04-06 16:19:33 +00:00
Luca Pierri
cdf87f4b8f Retrieve tweet location 2021-04-06 16:08:34 +00:00
JustAnotherArchivist
47fbc2a84d Add note on features exclusive to the dev version
Cf. #195
2021-02-24 19:39:45 +00:00
JustAnotherArchivist
5cd3b7d7cc Fix crash on rare weird 503 responses from Twitter without content 2021-01-26 22:39:02 +00:00
JustAnotherArchivist
0121fa51c2 Fix crash on users with a broken URL in the profile description 2021-01-26 18:33:34 +00:00
JustAnotherArchivist
892941b609 Fix crash on reposts of hidden profiles 2020-12-13 23:22:17 +00:00
JustAnotherArchivist
e3022628b6 Fix crash on photo reposts 2020-12-13 22:46:28 +00:00
JustAnotherArchivist
fdc33d0dba Include properties in JSON representation
This fixes the lack of the profile URL on Twitter users because it's generated using the username rather than set explicitly as a field.
2020-11-05 05:55:26 +00:00
JustAnotherArchivist
6d6411cc24 Fix KeyError on entity for inexistent Twitter accounts 2020-11-03 23:21:28 +00:00
JustAnotherArchivist
61a1ecffc5 Merge pull request #141 from gitshrl/twitter/split-source-url-label
Split tweet source into URL and label
2020-10-27 18:44:10 +00:00
sahrul
d2dce37fa0 add the original tweet source 2020-10-27 13:21:21 +07:00
sahrul
d65f0434da split source into url and label 2020-10-26 16:46:10 +07:00
JustAnotherArchivist
7499384110 Merge pull request #131 from gitshrl/facebook/fix-group-pagination
Fix pagination error for Facebook group scraper
2020-10-21 15:08:50 +00:00
sahrul
7a0f68b7ec fix pagination for facebook group scraper 2020-10-21 21:30:00 +07:00
JustAnotherArchivist
1a219fd2b6 Merge pull request #129 from gitshrl/facebook/fix-group-scraper
Update base URL for Facebook group scraper
2020-10-21 14:03:59 +00:00
sahrul
6fb98dae12 update base url for facebook group scraper 2020-10-21 19:57:02 +07:00
JustAnotherArchivist
8c2c0fa47a Remove workaround for http://bugs.python.org/issue16308 as snscrape requires 3.8+ now anyway 2020-10-18 20:25:54 +00:00
JustAnotherArchivist
58c8365c33 Add test extra requirements 2020-10-18 20:03:29 +00:00
JustAnotherArchivist
2c11ec38fa Replace requests.models with plain requests
requests.models is all but undocumented, and the three types needed here are all in the requests namespace as well.
2020-10-18 02:35:55 +00:00
JustAnotherArchivist
fe5e23502d collections.deque support and other minor improvements to snscrape._cli._repr 2020-10-18 02:12:09 +00:00
JustAnotherArchivist
644cd1d2fb Add support for various further complicated types to snscrape._cli._repr 2020-10-18 01:42:45 +00:00
JustAnotherArchivist
5ccfab6314 Add .gitignore 2020-10-18 01:14:04 +00:00
JustAnotherArchivist
bf895ea5b1 Minor README cleanup 2020-10-17 21:21:20 +00:00
JustAnotherArchivist
e956e2562b Replace pkg_resources with importlib.metadata 2020-10-17 21:16:45 +00:00
JustAnotherArchivist
defe874bf4 Fix date extraction on VK
Only the most recent posts have the nice timestamp property...
2020-10-17 02:22:15 +00:00
JustAnotherArchivist
3f8935ee4d Fix crash on video reposts 2020-10-17 02:20:40 +00:00
JustAnotherArchivist
cd12500dbf Fix date extraction on quoted posts 2020-10-17 02:13:27 +00:00
JustAnotherArchivist
5dc61d50ac Add support for outlinks, photos, videos, and quoted posts on VK 2020-10-17 00:07:26 +00:00
JustAnotherArchivist
11a82e110a Remove obsolete comment
Cf. f296f9d2
2020-10-16 18:37:51 +00:00
JustAnotherArchivist
16ebe8bf48 Introduce dedicated IntWithGranularity type and deprecate the direct *Granularity fields 2020-10-16 18:20:47 +00:00
JustAnotherArchivist
1bbe25647a Refactor deprecated properties 2020-10-16 18:11:52 +00:00
JustAnotherArchivist
e22b461563 Add Python 3.9 classifier 2020-10-16 01:27:17 +00:00
JustAnotherArchivist
c4a5715e18 Fix Facebook user and community scrapers
Facebook is redirecting the previous user agent to the mobile site; use current Firefox ESR instead.
2020-10-16 01:20:50 +00:00
JustAnotherArchivist
5cb64faa72 Formally deprecate the already deprecated item attributes 2020-10-16 00:55:55 +00:00
JustAnotherArchivist
0f78aa45fc Refactor --format handling to avoid conversion to dict 2020-10-16 00:55:14 +00:00
JustAnotherArchivist
179112a310 Fix --format
Broken by the switch to dataclasses in bd53e729
2020-10-16 00:27:13 +00:00
JustAnotherArchivist
4ce9ed4eb3 Add --progress option that prints a status update every 100 results and at the end
Closes #116
2020-10-16 00:00:43 +00:00
JustAnotherArchivist
11414cb68f Rename cli module to make it clear that it is considered private API 2020-10-15 23:47:07 +00:00
JustAnotherArchivist
bd53e729a0 Replace named tuples with dataclasses and move JSON conversion logic to the base classes
Named tuples were never really adequate for this since the order aspect of them doesn't make sense.
Further, named tuples don't support multiple inheritance. This meant that the objects returned by get_items() were not actually Items, for example. Since Python 3.9, such named tuples cannot be created anymore.

Fixes #111
2020-10-15 23:44:28 +00:00
JustAnotherArchivist
ffd9289edc Reduce the logging level of retryable retrieval errors from WARNING to INFO
There is no real need to report these as WARNINGs as snscrape tries and in most cases manages to recover. Without --verbose, snscrape's output can be confusing (see #76). If the retries fail as well, snscrape will still log that as an ERROR and crash loudly.
2020-10-11 22:29:27 +00:00
JustAnotherArchivist
b1a7b9607f Skip individual Telegram photo/video links 2020-10-07 01:27:26 +00:00
JustAnotherArchivist
119e53d07c Fix Telegram post URL extraction 2020-10-07 01:15:51 +00:00
JustAnotherArchivist
c3e2e12369 Deprecate outlinksss 2020-10-01 22:00:26 +00:00
JustAnotherArchivist
a70b361176 Use more assignment expressions where appropriate 2020-10-01 21:45:25 +00:00
JustAnotherArchivist
8b68f1a8af Fix link previews for pure-image previews
... and any other preview that doesn't have all the things for some reason.
2020-10-01 18:56:55 +00:00
JustAnotherArchivist
c72bf3174f Use assignment expressions for cleaner code 2020-10-01 18:54:57 +00:00
JustAnotherArchivist
472cef2382 Add support for link previews 2020-10-01 18:51:14 +00:00
JustAnotherArchivist
b1d8475a03 Fix link extraction on Telegram 2020-10-01 18:29:08 +00:00
JustAnotherArchivist
3d3faf80bf Add python_requires to make it even clearer that 3.8+ is required 2020-09-26 16:32:00 +00:00
JustAnotherArchivist
bbb372284b Bump Python version in README 2020-09-26 15:56:55 +00:00
JustAnotherArchivist
8cf81e9bfc Fix twitter-profile scraper
The Twitter API returns different data structures there, leading to a variety of errors.
2020-09-25 02:45:07 +00:00
JustAnotherArchivist
d90f06b389 Extract more information on users from Twitter
Closes #78
2020-09-24 18:39:32 +00:00
JustAnotherArchivist
c519832755 Clarify twitter-list-posts argument value 2020-09-24 18:37:37 +00:00
JustAnotherArchivist
397a0b988e Remove Twitter list member scraper
It has been broken for a while. Member lists were removed from the old design, and they're behind a login wall on the new design.
2020-09-24 18:34:15 +00:00
JustAnotherArchivist
f1428fa0e0 Fix crash on nested quoted tweets 2020-09-24 02:45:49 +00:00
JustAnotherArchivist
7d2c546ee5 Deprecate hacky fields in Tweet objects 2020-09-24 02:00:45 +00:00
JustAnotherArchivist
2332c30e26 Replace locale-dependent strptime date parsing with email.utils.parsedate_to_datetime 2020-09-24 02:00:21 +00:00
JustAnotherArchivist
b78bf3e642 Fix crash on banner-less profiles and nested descriptionUrls 2020-09-24 01:58:38 +00:00
JustAnotherArchivist
1a09f9b9a3 Extract more information from Twitter
Including: reply/retweet/like/quote counts, media (photos, videos, and GIFs), full user object, quoted tweets, mentioned users, rendered content, conversation ID, language, source
2020-09-24 01:45:08 +00:00
JustAnotherArchivist
5ae5ec7bcd Bump Python version classifier
Python 3.8 is required since commit 1a2e367a.
2020-09-23 22:25:38 +00:00
JustAnotherArchivist
c0ff6631aa Update README 2020-09-22 22:30:08 +00:00
JustAnotherArchivist
ae60a4d0fd Add Weibo scraper
Closes #52
2020-09-13 02:27:35 +00:00
JustAnotherArchivist
800cfd5be0 Add support for disabling following redirects 2020-09-13 00:52:26 +00:00
JustAnotherArchivist
f296f9d21d Refactor post extraction of VK again to work around their weird behaviours
VK doesn't always return posts in chronological order, so that can't be used to filter out duplicates. Instead, remember the last 1k post IDs and filter using that. This should catch the vast majority of duplicates. (Also, duplicates can't only happen in the geoblocking workaround; sometimes, VK also simply returns the same post again for no obvious reason.)
2020-09-12 02:00:50 +00:00
JustAnotherArchivist
8265ffc19e Work around geoblocked posts on VK
To get around the block, try to iterate over post offsets individually instead of in 10-steps. This means we should get every post that isn't blocked as long as there are at least 10 posts between two blocked ones.

Fixes #68
2020-09-12 02:00:26 +00:00
JustAnotherArchivist
f8efe98608 Fix post order on VK: reinsert pinned post at the correct location in the stream 2020-09-12 00:03:29 +00:00
JustAnotherArchivist
2b5444f89e Restrict --max-results to zero or positive values; use zero to indicate fetching only the entity 2020-09-11 15:37:22 +00:00
JustAnotherArchivist
07d446fd19 Fix crash in VK scraper 2020-09-10 21:05:03 +00:00
JustAnotherArchivist
a25426043b Fix Telegram username canonicalisation 2020-09-09 09:33:57 +00:00
JustAnotherArchivist
84692846b9 Fix crash in Telegram scraper 2020-09-09 09:22:00 +00:00
JustAnotherArchivist
039b2c6719 Restructure Twitter classes since the 'common' scraper is only used for the old design anymore 2020-09-07 02:38:27 +00:00
JustAnotherArchivist
70a3d9ba3a Fix infinite loop at the end of profile pages 2020-09-01 04:01:27 +00:00
JustAnotherArchivist
bd619bf4e9 Log and ignore tweets which are not contained in the globalObjects
Fixes #61
2020-09-01 03:45:23 +00:00
JustAnotherArchivist
072519f539 Fix pagination on profile pages 2020-09-01 03:23:45 +00:00
JustAnotherArchivist
d9572ec450 Correctly serialise nested NamedTuples 2020-09-01 03:16:25 +00:00
JustAnotherArchivist
ba250aabf2 Extract retweeted tweet if present 2020-09-01 03:15:21 +00:00
JustAnotherArchivist
0cc4f0c016 Add support for Twitter profile pages
Closes #5
2020-09-01 03:13:49 +00:00
JustAnotherArchivist
1a2e367a87 Cache entities 2020-09-01 02:34:21 +00:00
JustAnotherArchivist
4f24843f89 Extract user ID 2020-09-01 02:26:13 +00:00
JustAnotherArchivist
bfb92a47b9 Move Tweet object generation to TwitterAPIScraper 2020-09-01 02:25:00 +00:00
JustAnotherArchivist
dc5d55004b Refactor API interaction into something cleaner and more reusable 2020-09-01 01:56:07 +00:00
JustAnotherArchivist
d8e7f96d4d Add support for Reddit
Closes #15
2020-08-31 03:38:20 +00:00
JustAnotherArchivist
bb83d1d72f Validate Twitter usernames
Closes #55
2020-08-24 19:03:52 +00:00
JustAnotherArchivist
1480260e47 Handle Telegram channels without public posts 2020-08-24 17:54:30 +00:00
JustAnotherArchivist
c8d688d39f Fix crash on Telegram pages without a description 2020-08-24 17:53:50 +00:00
JustAnotherArchivist
9df4352089 Fix crash on VK pages without an info div 2020-08-24 17:42:33 +00:00
JustAnotherArchivist
dd25fd0526 Add support for extracting the entity behind a scrape
Closes #11

Backwards incompatibility: snscrape.modules.twitter.Account is now called User. However, this was previously only used on the list member scraper, which has been broken for a while since the list member list is no longer publicly accessible.

For compatibility reasons, the CLI does not output the entity by default; the new option --with-entity enables it.
2020-08-24 01:38:27 +00:00
JustAnotherArchivist
c90fd54b6b Make datetime.date serialisable 2020-08-24 01:12:38 +00:00
JustAnotherArchivist
9528df48cd Refactor base URL handling 2020-08-24 01:12:06 +00:00
JustAnotherArchivist
924c35f883 Refactor guest token extraction code 2020-08-22 22:59:43 +00:00
JustAnotherArchivist
588ec415ff Force TwitterThreadScraper to fetch the old design (take 2) 2020-08-12 17:19:42 +00:00
JustAnotherArchivist
bf229414ba Add JSONL output format 2020-08-12 15:09:02 +00:00
JustAnotherArchivist
afa819547d Update README 2020-08-11 22:18:04 +00:00
JustAnotherArchivist
dbcdc159ef Add support for scraping Facebook page visitor posts aka 'Community'
Closes #18
2020-08-11 22:14:27 +00:00
JustAnotherArchivist
30f945897a Clean Facebook group post URLs
Most of the time, the URLs are already clean, but occasionally, Facebook includes tracking parameters (__xts__[0], __tn__)...
2020-08-11 20:48:14 +00:00
JustAnotherArchivist
eee5794ff9 Extract Facebook group post in chronological order (instead of by last comment)
Fixes #66
2020-08-11 20:47:42 +00:00
JustAnotherArchivist
966a6ebd8e Skip promoted tweets/ads
Fixes #67
2020-08-11 20:28:35 +00:00
JustAnotherArchivist
4d3d0fe0d7 Update search API parameter values to the ones currently used on Twitter 2020-08-11 20:26:56 +00:00
JustAnotherArchivist
7b967ff82a Twitter reverted their guest token change (90f9598e) 2020-07-08 22:07:18 +00:00
JustAnotherArchivist
90f9598ecc Adjust to Twitter's new method of handing out guest tokens
Fixes #64
2020-06-24 21:22:58 +00:00
JustAnotherArchivist
7b3c7deb28 Catch login redirects on Instagram 2020-05-30 00:56:34 +00:00
JustAnotherArchivist
040a11656c Update README 2020-05-30 00:53:52 +00:00
JustAnotherArchivist
1459245258 Consistently raise ScraperException on fatal errors 2020-05-30 00:53:49 +00:00
JustAnotherArchivist
dbe4c5ce55 Remove Google+ module
Google+ was mostly shut down in early 2019. What remained (Google+ for G Suite) was renamed to Google Currents and is for internal communication only (and therefore out of scope for snscrape).
2020-05-30 00:35:06 +00:00
JustAnotherArchivist
80491ecc2c Remove Gab module
Since Gab's move to a fork of Mastodon in July 2019, the module had been broken, and a new module would better be written from scratch as the platform changed entirely.
2020-05-30 00:23:33 +00:00
JustAnotherArchivist
1a71b58101 Add support for Telegram
Closes #50
2020-05-29 23:44:01 +00:00
JustAnotherArchivist
0ce37a69d4 Log exception details on crashes 2020-05-29 22:29:23 +00:00
JustAnotherArchivist
722bfd5f7c Handle Twitter tombstones
Fixes #63
2020-05-29 22:12:37 +00:00
JustAnotherArchivist
b6cc3180d9 Force TwitterThreadScraper and TwitterListMembersScraper to fetch the old design 2020-03-04 00:40:49 +00:00
JustAnotherArchivist
613395d1c2 Port TwitterSearchScraper to redesign
Fixes #57
2020-03-04 00:40:49 +00:00
JustAnotherArchivist
82a87b7b5a Merge pull request #53 from JackDallas/add-more-insta-fields
Add more fields to the instagram scraper
2020-02-09 23:48:59 +00:00
Jack Dallas
9568028bf9 Update changed fields 2020-02-07 11:30:16 +00:00
JustAnotherArchivist
6df351772e Fix crash in Facebook scraper on link-less entries 2020-02-05 16:15:10 +00:00
JustAnotherArchivist
541173b0c8 Merge pull request #54 from jodizzle/fix/vkontakte-user
Fix vkontakte-user: pagination returns JSON now, and handle some unscrapable profiles.
2020-02-05 14:56:12 +00:00
Jody Leonard
b6772d3778 vkontakte-user: Handle additional un-scrapeable profile case 2019-10-31 16:01:29 -04:00
Jody Leonard
20ea117a2c Fix vkontakte-user pagination 2019-10-30 22:29:49 -04:00
JackDallas
ff54c350bc Add more fields to the instagram scraper 2019-08-30 12:43:02 +01:00
JustAnotherArchivist
e6aae35304 Use setuptools_scm for versioning through git tags 2019-07-01 17:41:18 +00:00
JustAnotherArchivist
b698a201f5 Update scraper list 2019-07-01 16:05:21 +00:00
JustAnotherArchivist
7fe72cf708 Add a note about reporting issues with proper debugging information 2019-07-01 16:01:11 +00:00
JustAnotherArchivist
4651cde447 Refactor CLI logging and add --dump-locals for better debugging 2019-07-01 15:46:10 +00:00
JustAnotherArchivist
c99cc4b5d3 Remove existing logging handlers 2019-07-01 15:42:06 +00:00
JustAnotherArchivist
628074d6fc Print contents when ignoring a link-less entry 2019-07-01 01:35:00 +00:00
JustAnotherArchivist
64b293bd9e Add support for media sets
Closes #48
2019-07-01 01:34:17 +00:00
JustAnotherArchivist
180f4dfeb7 Add support for photo.php URLs
Fixes #42
2019-06-30 18:36:39 +00:00
JustAnotherArchivist
6d6e3fa16c Fix crash on (some?) inexistent groups 2019-06-30 18:36:30 +00:00
JustAnotherArchivist
5f7e6936c1 Add support for Facebook groups
Closes #47
2019-06-30 17:16:09 +00:00
JustAnotherArchivist
e2c05c9e0c Split common code off into FacebookCommonScraper and refactor odd link detection in preparation of group scraping 2019-06-30 16:28:33 +00:00
JustAnotherArchivist
14e11b28d2 Add support for Twitter lists
Closes #46
2019-06-30 14:39:29 +00:00
JustAnotherArchivist
1a07b3b7e8 Add support for Twitter threads 2019-06-30 02:11:46 +00:00
JustAnotherArchivist
4d8cc7bdb9 Extract outlinks from Facebook 2019-06-27 15:29:05 +00:00
JustAnotherArchivist
eec83f181e Check HTTP status code before attempting parsing 2019-06-27 15:25:26 +00:00
JustAnotherArchivist
fae7432c64 Log details about failed JSON parsing 2019-06-27 15:25:08 +00:00
JustAnotherArchivist
757818474d Add tweet ID and username fields to Tweet items 2019-06-23 11:48:54 +00:00
JustAnotherArchivist
e6c934c0b8 Retrieve as many posts at once as possible for Instagram hashtags 2019-06-21 09:56:12 +00:00
JustAnotherArchivist
d2315feec1 Add support for Instagram locations 2019-06-21 09:55:30 +00:00
JustAnotherArchivist
765ceeeb10 More complete and more readable exception dump 2019-06-18 14:25:38 +00:00
JustAnotherArchivist
731a2e8c8b Check that Instagram returned valid JSON, take 2
Fixes #22
2019-06-10 15:03:15 +00:00
JustAnotherArchivist
7d1916292c Twitter: stop recursion based on whether the server returns the same position instead of detecting an empty feed
Fixes #37
2019-06-10 14:38:25 +00:00
JustAnotherArchivist
0d509c4ba0 Check that Instagram returned valid JSON (fixes #22) 2019-05-30 15:04:05 +00:00
JustAnotherArchivist
907a003a59 Fix crash when Twitter search produces no results (fixes #41) 2019-05-24 11:51:50 +00:00
JustAnotherArchivist
8ada279b57 Add warning if Twitter module gets no results 2019-05-24 11:50:39 +00:00
JustAnotherArchivist
900eae54a6 Ignore branded content link on Facebook silently 2019-05-24 11:49:44 +00:00
JustAnotherArchivist
7989af27b5 Handle tweets by temporarily blocked accounts (which show up in the search results but don't have a date or content) 2019-05-21 22:37:43 +00:00
JustAnotherArchivist
e528ca3f26 Dump locals only for snscrape modules (closes #39) 2019-05-18 01:08:49 +00:00
JustAnotherArchivist
32a427dac3 Fix pagination on Twitter (fixes #40) 2019-05-18 01:08:00 +00:00
JustAnotherArchivist
7001983556 Skip timeline entries that don't have a link (fixes #36) 2019-05-16 23:17:46 +00:00
JustAnotherArchivist
64438afc92 Work around tweet URLs that don't have a data-expanded-url attribute (fixes #38) 2019-05-16 22:51:22 +00:00
JustAnotherArchivist
9e6538556a Dump also the deeper frames, not just the get_items one 2019-05-16 22:48:35 +00:00
JustAnotherArchivist
9c8bbf051c Fix order of processing in Twitter module for more useful locals dump output 2019-05-16 22:22:53 +00:00
JustAnotherArchivist
c6a11298ac Fix missing linebreak in locals dump output 2019-05-16 22:22:21 +00:00
JustAnotherArchivist
02cbf6ddf6 Dump locals to a temporary file in case of an exception 2019-05-16 18:29:30 +00:00
JustAnotherArchivist
3817aa59d4 Add support for extracting links from tweets (including cards)
Both the t.co and the original URLs can be extracted. Note that card links are always t.co since Twitter's HTML does not include the original URL for those.
2019-05-16 16:42:52 +00:00
JustAnotherArchivist
46a51008f8 Fix Instagram signature calculation 2019-05-16 16:19:51 +00:00
JustAnotherArchivist
f91979eb32 Add --max-position option to twitter-search scraper as a workaround for pagination stopping early (#37)
The value needs to be of the format 'TWEET-<seenID>-<newestID>' where <seenID> is the last result that was returned by a previous scrape and <newestID> is the first result returned by the initial scrape.
2019-05-10 17:30:15 +00:00
JustAnotherArchivist
85fff319bc Disable Twitter's spelling correction
src=typd means "this is what was typed in and could be incorrect". src=spxr is "no, I really mean that". src=sprv appears to be an alias of spxr that is no longer used.
2019-05-10 16:43:59 +00:00
JustAnotherArchivist
6b145526b7 Update README with new modules 2019-04-21 23:10:32 +02:00
JustAnotherArchivist
abf31764b1 Version 0.2.0 2019-04-21 23:03:21 +02:00
JustAnotherArchivist
64693f74bb Update Instagram query hash 2019-04-19 01:47:38 +02:00
JustAnotherArchivist
a7d08ed51c Remove leftover debugging print 2019-04-19 01:40:29 +02:00
JustAnotherArchivist
f48ca7726e Add support for Gab 2019-04-19 00:40:43 +02:00
JustAnotherArchivist
78c295f7e0 Add support for VKontakte (fixes #13) 2019-04-18 18:39:21 +02:00
JustAnotherArchivist
a5aca1a14f Add support for Instagram hashtags (fixes #29) 2019-04-18 16:14:54 +02:00
JustAnotherArchivist
96f7d871c1 Ignore Scraper subclasses which don't set a name 2019-04-18 16:14:26 +02:00
JustAnotherArchivist
b5dfd37949 Support unix timestamps in --since 2019-04-18 16:01:35 +02:00
JustAnotherArchivist
b511397791 Add --since option to return only results newer than a certain date (fixes #19) 2019-04-18 15:12:29 +02:00
JustAnotherArchivist
536fcb3303 Return proper items from scrapers including clean URLs (fixes #9 and #10) 2019-04-18 14:44:21 +02:00
JustAnotherArchivist
f8d812f799 Include permalink.php, events, and notes (fixes #32) 2019-04-18 04:22:47 +02:00
JustAnotherArchivist
c2cebd9166 Accept-Language header to get an English response unconditionally 2019-04-18 03:58:37 +02:00
JustAnotherArchivist
73bc99596f Treat Twitter responses without a Content-Type header as invalid (fixes #21) 2019-04-18 02:24:35 +02:00
JustAnotherArchivist
8458c12218 Rewrite link extraction on Facebook (fixes #17)
Facebook's returned HTML has a large number of inconsistencies; some (most) pages include a <link rel="canonical" /> but some don't, for example. This was at the root of the failing post extraction for some Facebook pages (#17). The previous link extraction technique was also quite poor for other reasons though. The new method uses the relevant CSS classes instead. Despite probably being the result of a CSS minimiser or similar, these seem to be quite stable: they haven't changed in the past two years (but the more readable ones have!).
2019-04-18 02:14:21 +02:00
JustAnotherArchivist
b59c7e8d8f Merge pull request #28 from peterk/master
Adds socks proxy support (via requests)
2019-03-11 13:32:07 +01:00
Peter Krantz
3ceb849d98 Adds socks proxy support (via requests) 2019-01-10 22:54:42 +01:00
JustAnotherArchivist
f5ee1f7ac5 Merge pull request #26 from ludios/avoid-twitter-bans
twitter: randomize user agent to avoid Twitter's (IP, UA)-keyed bans
2018-12-25 02:19:17 +01:00
Ivan Kozik
1984110f78 twitter: randomize user agent to avoid Twitter's (IP, UA)-keyed bans 2018-12-24 08:03:33 +00:00
JustAnotherArchivist
c5a5dcb92c snscrape is now on PyPI 2018-10-09 17:26:03 +02:00
JustAnotherArchivist
cfb1c9a2aa Version 0.1.3 2018-10-01 03:26:22 +02:00
JustAnotherArchivist
d0d3c8b2a6 Better log output for temporary failures (fixes #2) 2018-10-01 03:24:29 +02:00
JustAnotherArchivist
4d0350e541 Disable "quality filter" on Twitter (fixes #3) 2018-10-01 02:51:33 +02:00
JustAnotherArchivist
d17aa15bcb Version 0.1.2 2018-09-11 12:44:07 +02:00
JustAnotherArchivist
d1ef280d6e Fix snscrape.modules not getting installed 2018-09-11 12:43:10 +02:00
17 changed files with 4385 additions and 366 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
__pycache__/
/dist/
/snscrape.egg-info/
/.eggs/

View File

@@ -1,38 +1,66 @@
# snscrape
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
The following services are currently supported:
* Facebook: user profiles
* Google Plus: user profiles
* Instagram: user profiles
* Twitter: user profiles, hashtags, and searches
* Facebook: user profiles, groups, and communities (aka visitor posts)
* Instagram: user profiles, hashtags, and locations
* Mastodon: user profiles and toots (single or thread)
* Reddit: users, subreddits, and searches (via Pushshift)
* Telegram: channels
* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
* VKontakte: user profiles
* Weibo (Sina Weibo): user profiles
## Requirements
snscrape requires Python 3.6 or higher. The Python package dependencies are installed automatically when you install snscrape.
snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.
Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
## Installation
pip3 install snscrape
If you want to use the development version:
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
## Usage
To get all tweets by Jason Scott (@textfiles):
### CLI
The generic syntax of snscrape's CLI is:
snscrape [GLOBAL-OPTIONS] SCRAPER-NAME [SCRAPER-OPTIONS] [SCRAPER-ARGUMENTS...]
`snscrape --help` and `snscrape SCRAPER-NAME --help` provide details on the options and arguments. `snscrape --help` also lists all available scrapers.
The default output of the CLI is the URL of each result.
Some noteworthy global options are:
* `--jsonl` to get output as JSONL. This includes all information extracted by snscrape (e.g. message content, datetime, images; details vary by scraper).
* `--max-results NUMBER` to only return the first `NUMBER` results.
* `--with-entity` to get an item on the entity being scraped, e.g. the user or channel. This is not supported on all scrapers. (You can use this together with `--max-results 0` to only fetch the entity info.)
#### Examples
Collect all tweets by Jason Scott (@textfiles):
snscrape twitter-user textfiles
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `@textfiles-tweets`:
It's usually useful to redirect the output to a file for further processing, e.g. in bash using the filename `twitter-@textfiles`:
```bash
snscrape twitter-user textfiles >@textfiles-tweets
snscrape twitter-user textfiles >twitter-@textfiles
```
To get the latest 100 tweets with the hashtag #archiveteam:
snscrape --max-results 100 twitter-hashtag archiveteam
`snscrape --help` or `snscrape <module> --help` provides details on the available options. `snscrape --help` also lists all available modules.
### Library
It is also possible to use snscrape as a library in Python, but this is currently undocumented.
## Issue reporting
If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue.
## License
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

View File

@@ -1,22 +1,42 @@
import os.path
import setuptools
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp:
readme = fp.read()
setuptools.setup(
name = 'snscrape',
version = '0.1.1',
description = 'A social networking service scraper',
long_description = readme,
long_description_content_type = 'text/markdown',
author = 'JustAnotherArchivist',
url = 'https://github.com/JustAnotherArchivist/snscrape',
classifiers = [
'Development Status :: 4 - Beta',
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
],
packages = ['snscrape'],
install_requires = ['requests', 'lxml', 'beautifulsoup4'],
packages = ['snscrape', 'snscrape.modules'],
setup_requires = ['setuptools_scm'],
use_scm_version = True,
install_requires = [
'requests[socks]',
'lxml',
'beautifulsoup4',
'pytz; python_version < "3.9.0"',
'filelock',
],
python_requires = '~=3.8',
extras_require = {
'test': ['coverage'],
},
entry_points = {
'console_scripts': [
'snscrape = snscrape.cli:main',
'snscrape = snscrape._cli:main',
],
},
)

338
snscrape/_cli.py Normal file
View File

@@ -0,0 +1,338 @@
import argparse
import collections
import contextlib
import dataclasses
import datetime
import importlib.metadata
import inspect
import logging
import requests
# Imported in parse_args() after setting up the logger:
#import snscrape.base
#import snscrape.modules
#import snscrape.version
import sys
import tempfile
## Logging
dumpLocals = False
logger = logging # Replaced below after setting the logger class
class Logger(logging.Logger):
    '''A logger which, when the module-level dumpLocals flag is set, follows every message of level WARNING or higher with a dump of the calling stack and its local variables.'''

    def _log_with_stack(self, level, *args, **kwargs):
        '''Log the message at the given level and, if dumping is enabled, write a stack/locals dump and log where it went.'''
        super().log(level, *args, **kwargs)
        if not dumpLocals:
            return
        frames = inspect.stack()
        # frames[0] is this method and frames[1] the public logging method that called it;
        # a dump is only produced when there is at least one frame beyond those two.
        if len(frames) < 3:
            return
        callerFrames = frames[2:]
        callerFrames.reverse()  # Outermost frame first
        dumpName = _dump_stack_and_locals(callerFrames)
        super().log(level, f'Dumped stack and locals to {dumpName}')

    def warning(self, *args, **kwargs):
        self._log_with_stack(logging.WARNING, *args, **kwargs)

    def error(self, *args, **kwargs):
        self._log_with_stack(logging.ERROR, *args, **kwargs)

    def critical(self, *args, **kwargs):
        self._log_with_stack(logging.CRITICAL, *args, **kwargs)

    def log(self, level, *args, **kwargs):
        '''Log at an arbitrary level; only levels of WARNING and above take the dumping path.'''
        if level < logging.WARNING:
            super().log(level, *args, **kwargs)
            return
        self._log_with_stack(level, *args, **kwargs)
def _requests_request_repr(name, request):
ret = []
ret.append(f'{name} = {request!r}')
ret.append(f'\n {name}.method = {request.method}')
ret.append(f'\n {name}.url = {request.url}')
ret.append(f'\n {name}.headers = \\')
for field in request.headers:
ret.append(f'\n {field} = {_repr("_", request.headers[field])}')
for attr in ('body', 'params', 'data'):
if hasattr(request, attr) and getattr(request, attr):
ret.append(f'\n {name}.{attr} = ')
ret.append(_repr('_', getattr(request, attr)).replace('\n', '\n '))
return ''.join(ret)
def _requests_response_repr(name, response, withHistory = True):
    '''Build a multi-line textual dump of a response object.

    The dump covers the repr, URL, originating request, status code, headers, and content. If withHistory is true and the response has a history, each previous response is dumped as well (without its own history).
    '''
    pieces = [
        f'{name} = {response!r}',
        f'\n {name}.url = {response.url}',
        f'\n {name}.request = ',
        _repr('_', response.request).replace('\n', '\n '),
    ]
    if withHistory and response.history:
        pieces.append(f'\n {name}.history = [')
        for earlierResponse in response.history:
            pieces.append('\n ')
            # Nested responses are dumped without history to keep the recursion one level deep.
            pieces.append(_requests_response_repr('_', earlierResponse, withHistory = False).replace('\n', '\n '))
        pieces.append('\n ]')
    pieces.append(f'\n {name}.status_code = {response.status_code}')
    pieces.append(f'\n {name}.headers = \\')
    pieces.extend(f'\n {field} = {_repr("_", response.headers[field])}' for field in response.headers)
    pieces.append(f'\n {name}.content = {_repr("_", response.content)}')
    return ''.join(pieces)
def _requests_exception_repr(name, exc):
    '''Build a multi-line textual dump of an exception carrying request and response attributes: its repr followed by dumps of both attributes.'''
    requestDump = _repr(f'{name}.request', exc.request).replace('\n', '\n ')
    responseDump = _repr(f'{name}.response', exc.response).replace('\n', '\n ')
    return ''.join([
        f'{name} = {exc!r}',
        '\n ' + requestDump,
        '\n ' + responseDump,
    ])
def _repr(name, value):
    '''Build a (possibly multi-line) 'name = value' dump of an arbitrary value.

    requests responses, requests, and exceptions are delegated to their dedicated dump functions. Dicts, dataclass instances, and list/tuple/deque values that contain anything other than ints and strs are expanded recursively with one child per line. Everything else falls back to the value's repr.

    Args:
        name: the label to print in front of the value.
        value: the object to dump.

    Returns:
        The dump as a string.
    '''
    # Exact type checks for the response/request types, as in the original; exceptions are matched by isinstance.
    if type(value) is requests.Response:
        return _requests_response_repr(name, value)
    if type(value) in (requests.PreparedRequest, requests.Request):
        return _requests_request_repr(name, value)
    if isinstance(value, requests.exceptions.RequestException):
        return _requests_exception_repr(name, value)

    # Containers: collect (child label, child value) pairs and render them uniformly below.
    children = None
    if isinstance(value, dict):
        children = [(f'{name}[{k!r}]', v) for k, v in value.items()]
    elif isinstance(value, (list, tuple, collections.deque)) and not all(isinstance(v, (int, str)) for v in value):
        children = [(f'{name}[{i}]', v) for i, v in enumerate(value)]
    elif dataclasses.is_dataclass(value) and not isinstance(value, type):
        # Each field is rendered once as 'name.field = value'. Previously the field name itself was also
        # passed through _repr and prepended, producing "name.field = 'field' = name.field = value".
        children = [(f'{name}.{f.name}', getattr(value, f.name)) for f in dataclasses.fields(value)]
    if children is not None:
        header = f'{name} = <{type(value).__module__}.{type(value).__name__}>\n '
        return header + '\n '.join(_repr(childName, child).replace('\n', '\n ') for childName, child in children)

    valueRepr = f'{name} = {value!r}'
    if '\n' in valueRepr:
        # Multi-line reprs start on their own line after a continuation backslash.
        return ''.join(['\\\n ', valueRepr.replace('\n', '\n ')])
    return valueRepr
@contextlib.contextmanager
def _dump_locals_on_exception():
try:
yield
except Exception as e:
trace = inspect.trace()
if len(trace) >= 2:
name = _dump_stack_and_locals(trace[1:], exc = e)
logger.fatal(f'Dumped stack and locals to {name}')
raise
def _dump_stack_and_locals(trace, exc = None):
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
if exc is not None:
fp.write('Exception:\n')
fp.write(f' {type(exc).__module__}.{type(exc).__name__}: {exc!s}\n')
fp.write(f' args: {exc.args!r}\n')
fp.write('\n')
fp.write('Stack:\n')
for frameRecord in trace:
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
if frameRecord.code_context is not None:
for line in frameRecord.code_context:
fp.write(f' {line.strip()}\n')
fp.write('\n')
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
if module is None:
# Module-less frame, e.g. dataclass.__init__
for j in reversed(range(i)):
if modules[j] is not None:
break
else:
# No previous module scope
continue
module = modules[j]
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
continue
locals_ = frameRecord[0].f_locals
fp.write(f'Locals from file "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}:\n')
for variableName in locals_:
variable = locals_[variableName]
varRepr = _repr(variableName, variable)
fp.write(f' {variableName} {type(variable)} = ')
fp.write(varRepr.replace('\n', '\n '))
fp.write('\n')
fp.write('\n')
if 'self' in locals_ and hasattr(locals_['self'], '__dict__'):
fp.write('Object dict:\n')
fp.write(repr(locals_['self'].__dict__))
fp.write('\n\n')
name = fp.name
return name
def parse_datetime_arg(arg):
    '''Parse a command-line datetime argument into a timezone-aware datetime.datetime.

    Accepted forms are 'YYYY-MM-DD HH:MM:SS', 'YYYY-MM-DD', either optionally followed by ' ' and a UTC offset (%z), and an integer unix timestamp. Values without an offset are taken to be UTC.

    Args:
        arg: the argument string.

    Returns:
        An aware datetime.datetime.

    Raises:
        argparse.ArgumentTypeError: if arg matches none of the accepted forms, including integer timestamps that are out of range for the platform.
    '''
    for fmt in ('%Y-%m-%d %H:%M:%S %z', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %z', '%Y-%m-%d'):
        try:
            d = datetime.datetime.strptime(arg, fmt)
        except ValueError:
            continue
        if d.tzinfo is None:
            d = d.replace(tzinfo = datetime.timezone.utc)
        return d
    # Try treating it as a unix timestamp
    try:
        return datetime.datetime.fromtimestamp(int(arg), datetime.timezone.utc)
    except (ValueError, OverflowError, OSError):
        # ValueError: not an integer (or year out of range); OverflowError/OSError: timestamp out of range for the platform.
        pass
    raise argparse.ArgumentTypeError(f'Cannot parse {arg!r} into a datetime object')
def parse_format(arg):
    '''Convert a user-supplied output format into a str.format template operating on the item.

    Every single '{' is replaced by '{0.' so that fields refer to attributes of the item (e.g. '{url}' becomes '{0.url}'), while escaped braces '{{' are kept intact.

    Args:
        arg: the format string from the command line.

    Returns:
        The rewritten format string.

    Raises:
        argparse.ArgumentTypeError: if arg ends in an unmatched single '{'. (Previously this case leaked a bare StopIteration.)
    '''
    parts = arg.split('{')
    lastIndex = len(parts) - 1
    out = []
    i = 0
    while i < lastIndex:
        out.append(parts[i])
        if parts[i + 1] == '':
            # An empty part after a brace means either a double brace or a brace at the very end of the string.
            if i + 1 == lastIndex:
                raise argparse.ArgumentTypeError(f'Invalid format {arg!r}: unmatched \'{{\' at the end')
            out.append('{{')
            i += 2  # Skip the empty part between the two braces
        else:
            out.append('{0.')
            i += 1
    out.append(parts[-1])
    return ''.join(out)
class CitationAction(argparse.Action):
    '''argparse action that prints citation information taken from the installed snscrape package metadata and exits.'''

    def __init__(self, option_strings, dest = argparse.SUPPRESS, *args, default = argparse.SUPPRESS, **kwargs):
        # dest defaults to SUPPRESS; the default keyword is accepted here but not forwarded to argparse.Action.
        super().__init__(option_strings, dest, *args, **kwargs)

    def __call__(self, parser, namespace, values, optionString):
        try:
            meta = importlib.metadata.metadata('snscrape')
        except importlib.metadata.PackageNotFoundError:
            print('Error: could not find snscrape installation. --citation does not work without the package being installed.', file = sys.stderr)
            parser.exit(1)
        print(f'Author: {meta["author"]}')
        print(f'Title: {meta["name"]}: {meta["summary"]}')
        print(f'URL: {meta["home-page"]}')
        version = meta['version']
        print(f'Version: {version}')
        # The first four characters of the fourth dot-separated version component are appended to '2018'.
        dateSuffix = version.split('.', 3)[3][:4]
        print(f'Date: 2018{dateSuffix}')
        if '.dev' in version:
            print()
            print('WARNING! You are running a development version. The date range may be incorrect. Please adjust the upper end of the range to the year of the commit.')
        parser.exit()
def parse_args():
    '''Build the command-line parser, including one subparser per named scraper class, and parse sys.argv.

    Returns:
        The parsed argparse.Namespace; its cls attribute is the scraper class selected on the command line.
    '''
    # Imported here rather than at module level (the module header notes they are imported after setting up the logger).
    import snscrape.base
    import snscrape.modules
    import snscrape.version

    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action = 'version', version = f'snscrape {snscrape.version.__version__}')
    parser.add_argument('--citation', action = CitationAction, nargs = 0, help = 'Display recommended citation information and exit')
    parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
    parser.add_argument('--dump-locals', dest = 'dumpLocals', action = 'store_true', default = False, help = 'Dump local variables on serious log messages (warnings or higher)')
    parser.add_argument('--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
        help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff')
    parser.add_argument('-n', '--max-results', dest = 'maxResults', type = lambda x: int(x) if int(x) >= 0 else parser.error('--max-results N must be zero or positive'), metavar = 'N', help = 'Only return the first N results')
    outputGroup = parser.add_mutually_exclusive_group(required = False)
    outputGroup.add_argument('-f', '--format', dest = 'format', type = parse_format, default = None, help = 'Output format')
    outputGroup.add_argument('--jsonl', dest = 'jsonl', action = 'store_true', default = False, help = 'Output JSONL')
    parser.add_argument('--with-entity', dest = 'withEntity', action = 'store_true', default = False, help = 'Include the entity (e.g. user, channel) as the first output item')
    parser.add_argument('--since', type = parse_datetime_arg, metavar = 'DATETIME', help = 'Only return results newer than DATETIME')
    parser.add_argument('--progress', action = 'store_true', default = False, help = 'Report progress on stderr')

    subparsers = parser.add_subparsers(dest = 'scraper', metavar = 'SCRAPER', title = 'scrapers', required = True)

    # Walk the Scraper class hierarchy breadth-first; classes without a name are not exposed but their subclasses are still visited.
    pending = collections.deque(snscrape.base.Scraper.__subclasses__())
    scrapers = {}
    while pending:
        cls = pending.popleft()
        if cls.name is not None:
            scrapers[cls.name] = cls
        pending.extend(cls.__subclasses__())

    for scraperName in sorted(scrapers):
        cls = scrapers[scraperName]
        subparser = subparsers.add_parser(cls.name, help = '', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
        cls._cli_setup_parser(subparser)
        subparser.set_defaults(cls = cls)

    args = parser.parse_args()

    # Zero results only makes sense when at least the entity is printed.
    if args.maxResults == 0 and not args.withEntity:
        parser.error('--max-results 0 is only valid when used with --with-entity')

    return args
def setup_logging():
    '''Install Logger as the logger class and replace the module-level logger with an instance obtained after that.'''
    global logger
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
def configure_logging(verbosity, dumpLocals_):
    '''Configure the root logger for CLI use.

    Sets the module-level dumpLocals flag, adjusts the root logger's level according to the verbosity, removes all handlers currently attached to the root logger, and attaches a single stream handler with the CLI's log format.

    Args:
        verbosity: 0 leaves the level untouched, 1 selects INFO, anything higher selects DEBUG.
        dumpLocals_: whether warnings and more severe messages should trigger a stack/locals dump.
    '''
    global dumpLocals
    dumpLocals = dumpLocals_

    rootLogger = logging.getLogger()

    # Set level
    if verbosity > 0:
        level = logging.INFO if verbosity == 1 else logging.DEBUG
        rootLogger.setLevel(level)
        for handler in rootLogger.handlers:
            handler.setLevel(level)

    # Create formatter
    formatter = logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{')

    # Remove existing handlers
    # Iterate over a copy: removeHandler() mutates rootLogger.handlers, and iterating the live list skips every other handler.
    for handler in list(rootLogger.handlers):
        rootLogger.removeHandler(handler)

    # Add stream handler
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    rootLogger.addHandler(handler)
def main():
    '''CLI entry point: set up logging, parse the arguments, run the selected scraper, and print its entity and/or items.'''
    setup_logging()
    args = parse_args()
    configure_logging(args.verbosity, args.dumpLocals)
    scraper = args.cls._cli_from_args(args)

    i = 0  # Stays 0 if the scraper yields nothing, so the final messages report 0 results
    with _dump_locals_on_exception():
        if args.withEntity and (entity := scraper.entity):
            print(entity.json() if args.jsonl else entity)
        if args.maxResults == 0:
            logger.info('Exiting after 0 results')
            return
        for i, item in enumerate(scraper.get_items(), start = 1):
            if args.since is not None and item.date < args.since:
                logger.info(f'Exiting due to reaching older results than {args.since}')
                break
            if args.jsonl:
                output = item.json()
            elif args.format is not None:
                output = args.format.format(item)
            else:
                output = item
            print(output)
            if args.progress and i % 100 == 0:
                print(f'Scraping, {i} results so far', file = sys.stderr)
            if args.maxResults and i >= args.maxResults:
                logger.info(f'Exiting after {i} results')
                if args.progress:
                    print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
                break
        else:
            # Only reached when the scraper ran to completion without hitting --since or --max-results
            logger.info(f'Done, found {i} results')
            if args.progress:
                print(f'Finished, {i} results', file = sys.stderr)

View File

@@ -1,22 +1,121 @@
import abc
import copy
import dataclasses
import datetime
import functools
import json
import logging
import requests
import time
import warnings
logger = logging.getLogger(__name__)
class Item:
class _DeprecatedProperty:
def __init__(self, name, repl, replStr):
self.name = name
self.repl = repl
self.replStr = replStr
def __get__(self, obj, objType):
if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class:
return self
warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2)
return self.repl(obj)
def _json_serialise_datetime(obj):
'''A JSON serialiser that converts datetime.datetime and datetime.date objects to ISO-8601 strings.'''
if isinstance(obj, (datetime.datetime, datetime.date)):
return obj.isoformat()
raise TypeError(f'Object of type {type(obj)} is not JSON serializable')
def _json_dataclass_to_dict(obj):
    '''Recursively convert dataclass objects into plain dicts.

    A dataclass becomes a dict with a '_type' entry ('module.ClassName') followed by its public fields and public properties. Tuples, lists, dicts, and sets are rebuilt with converted contents; any other object is deep-copied.
    '''
    if isinstance(obj, _JSONDataclass) or dataclasses.is_dataclass(obj):
        cls = type(obj)
        result = {'_type': f'{cls.__module__}.{cls.__name__}'}
        for field in dataclasses.fields(obj):
            assert field.name != '_type'
            if not field.name.startswith('_'):
                result[field.name] = _json_dataclass_to_dict(getattr(obj, field.name))
        # Add in (non-deprecated) properties
        for attrName in dir(obj):
            if not isinstance(getattr(cls, attrName, None), property):
                continue
            assert attrName != '_type'
            if not attrName.startswith('_'):
                result[attrName] = _json_dataclass_to_dict(getattr(obj, attrName))
        return result
    if isinstance(obj, (tuple, list)):
        return type(obj)(_json_dataclass_to_dict(element) for element in obj)
    if isinstance(obj, dict):
        return {_json_dataclass_to_dict(key): _json_dataclass_to_dict(value) for key, value in obj.items()}
    if isinstance(obj, set):
        return {_json_dataclass_to_dict(element) for element in obj}
    return copy.deepcopy(obj)
@dataclasses.dataclass
class _JSONDataclass:
    '''A base class for dataclasses for conversion to JSON'''

    def json(self):
        '''Convert the object to a JSON string

        Top-level IntWithGranularity values are written as plain ints with the granularity stored under an additional '<key>.granularity' entry.
        '''
        out = _json_dataclass_to_dict(self)
        # Collect the affected entries up front because the dict is modified in the loop below.
        granular = [(key, value) for key, value in out.items() if isinstance(value, IntWithGranularity)]
        for key, value in granular:
            out[key] = int(value)
            granularityKey = f'{key}.granularity'
            assert granularityKey not in out, f'Granularity collision on {key}.granularity'
            out[granularityKey] = value.granularity
        return json.dumps(out, default = _json_serialise_datetime)
@dataclasses.dataclass
class Item(_JSONDataclass):
'''An abstract base class for an item returned by the scraper's get_items generator.
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).'''
An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item).
'''
@abc.abstractmethod
def __str__(self):
pass
@dataclasses.dataclass
class Entity(_JSONDataclass):
    '''An abstract base class for an entity returned by the scraper's entity property.

    An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network.
    '''

    @abc.abstractmethod
    def __str__(self):
        '''Return the string representation of the entity; subclasses are expected to override this.'''
class IntWithGranularity(int):
    '''A number with an associated granularity

    For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.
    '''

    def __new__(cls, value, granularity, *args, **kwargs):
        '''Create the integer from value (extra arguments are passed on to int) and attach the granularity to it.'''
        instance = super().__new__(cls, value, *args, **kwargs)
        instance.granularity = granularity
        return instance

    def __reduce__(self):
        '''Describe how to rebuild the object from its plain value and granularity, both of which the constructor requires.'''
        constructorArgs = (int(self), self.granularity)
        return IntWithGranularity, constructorArgs
class URLItem(Item):
'''A generic item which only holds a URL string.'''
@@ -40,30 +139,74 @@ class Scraper:
name = None
def __init__(self, retries = 3):
def __init__(self, *, retries = 3, proxies = None):
self._retries = retries
self._proxies = proxies
self._session = requests.Session()
@abc.abstractmethod
def get_items(self):
    '''Iterator yielding Items.

    This base implementation does nothing; it is marked abstract for subclasses to provide.
    '''
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None):
def _get_entity(self):
'''Get the entity behind the scraper, if any.
This is the method implemented by subclasses for doing the actual retrieval/entity object creation. For accessing the scraper's entity, use the entity property.
'''
return None
@functools.cached_property
def entity(self):
    '''The entity behind the scraper as returned by _get_entity; cached by functools.cached_property after the first access.'''
    return self._get_entity()
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
proxies = proxies or self._proxies or {}
for attempt in range(self._retries + 1):
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
logger.info(f'Retrieving {req.url}')
logger.debug(f'... with headers: {headers!r}')
if data:
logger.debug(f'... with data: {data!r}')
if environmentSettings:
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
try:
r = self._session.send(req, timeout = timeout)
if responseOkCallback is None or responseOkCallback(r):
logger.debug(f'{req.url} retrieved successfully')
return r
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
except requests.exceptions.RequestException as exc:
logger.error(f'Error retrieving {url}: {exc!r}')
if attempt < self._retries:
retrying = ', retrying'
level = logging.INFO
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
else:
redirected = f' (redirected to {r.url})' if r.history else ''
logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}')
if r.history:
for i, redirect in enumerate(r.history):
logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})')
if responseOkCallback is not None:
success, msg = responseOkCallback(r)
else:
success, msg = (True, None)
msg = f': {msg}' if msg else ''
if success:
logger.debug(f'{req.url} retrieved successfully{msg}')
return r
else:
if attempt < self._retries:
retrying = ', retrying'
level = logging.INFO
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
if attempt < self._retries:
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
logger.info(f'Waiting {sleepTime:.0f} seconds')
@@ -81,11 +224,23 @@ class Scraper:
return self._request('POST', *args, **kwargs)
@classmethod
@abc.abstractmethod
def setup_parser(cls, subparser):
def _cli_setup_parser(cls, subparser):
pass
@classmethod
@abc.abstractmethod
def from_args(cls, args):
pass
def _cli_from_args(cls, args):
return cls._cli_construct(args)
@classmethod
def _cli_construct(cls, argparseArgs, *args, **kwargs):
return cls(*args, **kwargs, retries = argparseArgs.retries)
def nonempty_string(name):
    '''Create a converter function that strips its argument and rejects empty results.

    The returned function returns the stripped string, or raises ValueError if nothing is left after stripping. Its __name__ is set to name.
    '''

    def f(value):
        stripped = value.strip()
        if not stripped:
            raise ValueError('must not be an empty string')
        return stripped

    f.__name__ = name
    return f

View File

@@ -1,65 +0,0 @@
import argparse
import logging
import snscrape.base
import snscrape.modules
logger = logging.getLogger(__name__)
def parse_args():
    '''Build the command-line parser with one subcommand per scraper class and parse the command line.

    Returns the parsed argparse namespace; its cls attribute is the scraper class registered for the selected subcommand.
    Raises RuntimeError if no scraper subcommand was specified.
    '''

    def scraper_classes():
        # Walks all direct and indirect subclasses of Scraper; the list is extended while it is being iterated.
        queue = snscrape.base.Scraper.__subclasses__()
        for scraperClass in queue:
            yield scraperClass
            queue.extend(scraperClass.__subclasses__())

    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-v', '--verbose', '--verbosity', dest = 'verbosity', action = 'count', default = 0, help = 'Increase output verbosity')
    parser.add_argument(
        '--retry', '--retries', dest = 'retries', type = int, default = 3, metavar = 'N',
        help = 'When the connection fails or the server returns an unexpected response, retry up to N times with an exponential backoff',
    )
    parser.add_argument('-n', '--max-results', dest = 'maxResults', type = int, metavar = 'N', help = 'Only return the first N results')

    subparsers = parser.add_subparsers(dest = 'scraper', help = 'The scraper you want to use')
    for scraperClass in scraper_classes():
        subparser = subparsers.add_parser(scraperClass.name, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
        scraperClass.setup_parser(subparser)
        subparser.set_defaults(cls = scraperClass)

    parsed = parser.parse_args()

    # http://bugs.python.org/issue16308 / https://bugs.python.org/issue26510 (fixed in Python 3.7)
    if not parsed.scraper:
        raise RuntimeError('Error: no scraper specified')

    return parsed
def setup_logging(verbosity):
    '''Configure the root logger for the CLI.

    A verbosity of 1 sets the root logger and its already attached handlers to INFO, anything greater sets them to DEBUG, and 0 or less leaves the levels alone. A new stream handler with a timestamped format is always added.
    '''

    root = logging.getLogger()

    # Apply the requested level to the logger and to the handlers already attached to it
    if verbosity > 0:
        level = logging.DEBUG if verbosity != 1 else logging.INFO
        root.setLevel(level)
        for attached in root.handlers:
            attached.setLevel(level)

    # Attach a stream handler using the timestamped output format
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(logging.Formatter('{asctime}.{msecs:03.0f} {levelname} {name} {message}', datefmt = '%Y-%m-%d %H:%M:%S', style = '{'))
    root.addHandler(streamHandler)
def main():
    '''CLI entry point: parse the arguments, set up logging, and print every item of the selected scraper, stopping early once maxResults items were printed.'''

    args = parse_args()
    setup_logging(args.verbosity)
    scraper = args.cls.from_args(args)

    limit = args.maxResults
    count = 0 # Stays 0 if the scraper yields nothing
    limitReached = False
    for count, item in enumerate(scraper.get_items(), start = 1):
        print(item)
        if limit and count >= limit:
            limitReached = True
            logger.info(f'Exiting after {count} results')
            break
    if not limitReached:
        logger.info(f'Done, found {count} results')

View File

@@ -1,15 +1,17 @@
import importlib
import os
import snscrape.base
import pkgutil
__all__ = []
def _import_modules():
files = os.listdir(__path__[0])
for fn in files:
if fn.endswith('.py') and fn != '__init__.py':
# Import module if not already imported
moduleName = f'snscrape.modules.{fn[:-3]}'
module = importlib.import_module(moduleName)
prefixLen = len(__name__) + 1
for importer, moduleName, isPkg in pkgutil.iter_modules(__path__, prefix = f'{__name__}.'):
assert not isPkg
moduleNameWithoutPrefix = moduleName[prefixLen:]
__all__.append(moduleNameWithoutPrefix)
module = importer.find_module(moduleName).load_module(moduleName)
globals()[moduleNameWithoutPrefix] = module
_import_modules()

View File

@@ -1,77 +1,364 @@
__all__ = ['FacebookPost', 'User', 'FacebookUserScraper', 'FacebookCommunityScraper', 'FacebookGroupScraper']
import bs4
import dataclasses
import datetime
import json
import logging
import re
import snscrape.base
import typing
import urllib.parse
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
class FacebookUserScraper(snscrape.base.Scraper):
name = 'facebook-user'
@dataclasses.dataclass
class FacebookPost(snscrape.base.Item):
    '''A post found on a Facebook page or group; its string representation is the cleaned URL.'''

    # URL of the post after normalisation
    cleanUrl: str
    # URL of the post as linked on the page
    dirtyUrl: str
    date: datetime.datetime
    # Text of the post, if any
    content: typing.Optional[str]
    # External links found in the post
    outlinks: list

    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

    def __str__(self):
        return self.cleanUrl
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''A Facebook page; its string representation is the page URL built from the username.'''

    # Always required
    username: str
    pageId: int
    name: str
    verified: bool

    # Optional details, None when not available
    created: typing.Optional[datetime.date] = None
    pageOwner: typing.Optional[str] = None
    likes: typing.Optional[int] = None
    followers: typing.Optional[int] = None
    checkins: typing.Optional[int] = None
    address: typing.Optional[str] = None
    phone: typing.Optional[str] = None
    web: typing.Optional[str] = None
    keywords: typing.Optional[typing.List[str]] = None

    def __str__(self):
        return f'https://www.facebook.com/{self.username}/'
class _FacebookCommonScraper(snscrape.base.Scraper):
def _clean_url(self, dirtyUrl):
u = urllib.parse.urlparse(dirtyUrl)
if u.path == '/permalink.php':
# Retain only story_fbid and id parameters
q = urllib.parse.parse_qs(u.query)
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('story_fbid', q['story_fbid'][0]), ('id', q['id'][0]))), '')
elif u.path == '/photo.php':
# Retain only the fbid parameter
q = urllib.parse.parse_qs(u.query)
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('fbid', q['fbid'][0]),)), '')
elif u.path == '/media/set/':
# Retain only the set parameter and try to shorten it to the minimum
q = urllib.parse.parse_qs(u.query)
setVal = q['set'][0]
if setVal.rstrip('0123456789').endswith('.a.'):
setVal = f'a.{setVal.rsplit(".", 1)[1]}'
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
# No manipulation of the path needed, but strip the query string
clean = (u.scheme, u.netloc, u.path, '', '')
elif u.path.split('/')[2] in ('photos', 'videos'):
# Path: "/" username or ID "/" photos or videos "/" crap "/" ID of photo or video "/"
# But to be safe, also handle URLs that don't have that crap correctly.
if u.path.count('/') == 4:
clean = (u.scheme, u.netloc, u.path, '', '')
elif u.path.count('/') == 5:
# Strip out the third path component
pathcomps = u.path.split('/')
pathcomps.pop(3) # Don't forget about the empty string at the beginning!
clean = (u.scheme, u.netloc, '/'.join(pathcomps), '', '')
else:
return dirtyUrl
else:
# If we don't recognise the URL, just return the original one.
return dirtyUrl
return urllib.parse.urlunsplit(clean)
def _is_odd_link(self, href, entryText, mode):
# Returns (isOddLink: bool, warn: bool|None)
if mode == 'user':
if not any(x in href for x in ('/posts/', '/photos/', '/videos/', '/permalink.php?', '/events/', '/notes/', '/photo.php?', '/media/set/')):
if href == '#' and 'new photo' in entryText and 'to the album' in entryText:
# Don't print a warning if it's a "User added 5 new photos to the album"-type entry, which doesn't have a permalink.
return True, False
elif href.startswith('/business/help/788160621327601/?'):
# Skip the help article about branded content
return True, False
else:
return True, True
return False, None
elif mode == 'group':
if not re.match(r'^/groups/[^/]+/permalink/\d+/(\?|$)', href):
return True, True
return False, None
def _soup_to_items(self, soup, baseUrl, mode):
    '''Extract the posts contained in a parsed page fragment.

    soup is the parsed HTML, baseUrl is the URL relative links are resolved against, and mode ('user' or 'group') is passed on to _is_odd_link to decide which links count as post permalinks.
    Yields one FacebookPost per top-level entry with a usable link. Entries without a link and entries with odd links are skipped, the former always with a warning.
    '''

    cleanUrl = None # Value from previous iteration is used for warning on link-less entries
    for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
        # Check that this is not inside another div._5pcr to avoid duplicates or extracting the wrong URL (e.g. 'X was mentioned in a post' on community pages)
        parent = entry.parent
        isNested = False
        while parent:
            if parent.name == 'div' and 'class' in parent.attrs and '_5pcr' in parent.attrs['class']:
                isNested = True
                break
            parent = parent.parent
        if isNested:
            continue

        entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
        mediaSetA = entry.find('a', class_ = '_17z-')
        if not mediaSetA and not entryA:
            _logger.warning(f'Ignoring link-less entry after {cleanUrl}: {entry.text!r}')
            continue
        if mediaSetA and (not entryA or entryA['href'] == '#'):
            href = mediaSetA['href']
        elif entryA:
            href = entryA['href']

        oddLink, warn = self._is_odd_link(href, entry.text, mode)
        if oddLink:
            if warn:
                _logger.warning(f'Ignoring odd link: {href}')
            continue

        dirtyUrl = urllib.parse.urljoin(baseUrl, href)
        cleanUrl = self._clean_url(dirtyUrl)
        date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
        contentDiv = entry.find('div', class_ = '_5pbx')
        content = contentDiv.text if contentDiv else None

        # External links are wrapped in l.facebook.com redirects carrying the target in the 'u' parameter.
        outlinks = []
        for a in entry.find_all('a'):
            if not a.has_attr('href'):
                continue
            outHref = a.get('href')
            if not outHref.startswith('https://l.facebook.com/l.php?'):
                continue
            query = urllib.parse.parse_qs(urllib.parse.urlparse(outHref).query)
            if 'u' not in query or len(query['u']) != 1:
                _logger.warning(f'Ignoring odd outlink: {outHref}')
                continue
            outlink = query['u'][0]
            # The scheme test must be evaluated as one unit before the duplicate check: 'a or b and c' parses as 'a or (b and c)', which let repeated http:// links through.
            if outlink.startswith(('http://', 'https://')) and outlink not in outlinks:
                outlinks.append(outlink)

        yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks)
class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
def __init__(self, username, **kwargs):
    '''Set up a scraper for the given username; all keyword arguments are passed on to the parent class.'''
    super().__init__(**kwargs)
    self._username = username
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Filled by _initial_page on first use
    self._initialPage = None
    self._initialPageSoup = None
def _soup_to_items(self, soup, username, baseUrl):
yielded = set()
for a in soup.find_all('a', href = re.compile(r'^/[^/]+/(posts|photos|videos)/[^/]*\d')):
href = a.get('href')
if href.startswith(f'/{username}/'):
link = urllib.parse.urljoin(baseUrl, href)
if link not in yielded:
yield snscrape.base.URLItem(link)
yielded.add(link)
def _initial_page(self):
if self._initialPage is None:
_logger.info('Retrieving initial data')
r = self._get(self._baseUrl, headers = self._headers)
if r.status_code not in (200, 404):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
self._initialPage = r
self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
return self._initialPage, self._initialPageSoup
def get_items(self):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
spuriousForLoopPattern = re.compile(r'^for \(;;\);')
logger.info('Retrieving initial data')
baseUrl = f'https://www.facebook.com/{self._username}/'
r = self._get(baseUrl, headers = headers)
r, soup = self._initial_page()
if r.status_code == 404:
logger.warning('User does not exist')
_logger.warning('User does not exist')
return
elif r.status_code != 200:
logger.error('Got status code {r.status_code}')
return
soup = bs4.BeautifulSoup(r.text, 'lxml')
username = re.sub(r'^https://www\.facebook\.com/([^/]+)/$', r'\1', soup.find('link').get('href')) # Canonical capitalisation
baseUrl = f'https://www.facebook.com/{username}/'
yield from self._soup_to_items(soup, username, baseUrl)
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
yield from self._soup_to_items(soup, self._baseUrl, 'user')
while nextPageLink:
logger.info('Retrieving next page')
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
_logger.info('Retrieving next page')
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
response = json.loads(spuriousForLoopPattern.sub('', r.text))
assert 'domops' in response
assert len(response['domops']) == 1
assert len(response['domops'][0]) == 4
assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
assert response['domops'][0][1] == '#www_pages_reaction_see_more_unitwww_pages_home'
assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
assert response['domops'][0][2] == False
assert '__html' in response['domops'][0][3]
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
yield from self._soup_to_items(soup, username, baseUrl)
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
yield from self._soup_to_items(soup, self._baseUrl, 'user')
@classmethod
def setup_parser(cls, subparser):
subparser.add_argument('username', help = 'A Facebook username or user ID')
def _cli_setup_parser(cls, subparser):
subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')
@classmethod
def from_args(cls, args):
return cls(args.username, retries = args.retries)
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.username)
class FacebookUserScraper(_FacebookUserAndCommunityScraper):
    '''Scraper for the posts and profile information of a Facebook page.'''

    name = 'facebook-user'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._baseUrl = f'https://www.facebook.com/{self._username}/'

    def _get_entity(self):
        '''Extract the page's profile information from the initial page.

        Returns a User, or None if the initial page was not retrieved with status code 200.
        '''

        nameVerifiedMarkupPattern = re.compile(r'"markup":\[\["__markup_a588f507_0_0",\{"__html":(".*?")\}')
        handleDivPattern = re.compile(r'<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>')
        handlePattern = re.compile(r'<a\s[^>]*(?<=\s)href="/([^/]+)')
        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
        createdDatePattern = re.compile('^(' + '|'.join(months) + r') (\d+), (\d+)$')
        # Ending of a line in the community box -> field that receives the number at the start of that line
        counterSuffixes = (
            (' people like this', 'likes'),
            (' people follow this', 'followers'),
            (' check-ins', 'checkins'),
        )
        # Icons identifying the kind of information in an about box row
        addressIcon = 'https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png'
        phoneIcon = 'https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png'
        webIcon = 'https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png'
        keywordsIcon = 'https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png'

        r, soup = self._initial_page()
        if r.status_code != 200:
            return

        kwargs = {}

        handleDiv = handleDivPattern.search(r.text)
        handle = handlePattern.search(handleDiv.group(0))
        kwargs['username'] = handle.group(1)

        # Name and verification badge are embedded as a JSON-encoded HTML string
        nameVerifiedHtml = json.loads(nameVerifiedMarkupPattern.search(r.text).group(1))
        nameVerifiedSoup = bs4.BeautifulSoup(nameVerifiedHtml, 'lxml')
        kwargs['name'] = nameVerifiedSoup.find('a', class_ = '_64-f').text
        kwargs['verified'] = bool(nameVerifiedSoup.find('a', class_ = '_56_f'))

        transparencyText = soup.find('div', class_ = '_61-0').text
        if transparencyText.startswith('Page created - '):
            m = createdDatePattern.match(transparencyText.split(' - ', 1)[1])
            assert m, 'unexpected created div content'
            monthName, day, year = m.groups()
            kwargs['created'] = datetime.date(int(year), months.index(monthName) + 1, int(day))
        if transparencyText.startswith('Confirmed Page Owner: '):
            kwargs['pageOwner'] = transparencyText.split(': ', 1)[1]

        communityDiv = soup.find('div', class_ = '_6590')
        for div in communityDiv.find_all('div', class_ = '_4bl9'):
            text = div.text
            for suffix, key in counterSuffixes:
                if text.endswith(suffix):
                    kwargs[key] = int(text.split(' ', 1)[0].replace(',', ''))
                    break

        aboutDiv = soup.find('div', class_ = '_u9q')
        if aboutDiv:
            # As if the above wasn't already ugly enough, this is where it gets really bad...
            for div in aboutDiv.find_all('div', class_ = '_2pi9'):
                img = div.find('img', class_ = '_3-91')
                if not img:
                    continue
                icon = img['src']
                if icon == addressIcon:
                    rawAddress = div.find('div', class_ = '_2wzd').text
                    kwargs['address'] = re.sub(r' \((\d+,)?\d+(\.\d+)? mi\)', '\n', rawAddress) # Remove distance from inferred IP location, restore linebreak
                elif icon == phoneIcon:
                    kwargs['phone'] = div.find('div', class_ = '_4bl9').text
                elif icon == webIcon:
                    for a in div.find_all('a'):
                        if a.text == '' or 'href' not in a.attrs or a.find('span'):
                            continue
                        dirtyWeb = a['href']
                        assert dirtyWeb.startswith('https://l.facebook.com/l.php?u='), 'unexpected web link'
                        kwargs['web'] = urllib.parse.unquote(dirtyWeb.split('=', 1)[1].split('&', 1)[0])
                elif icon == keywordsIcon:
                    kwargs['keywords'] = div.find('div', class_ = '_4bl9').text.split(' · ')

        # The numeric page ID is only available from the Android app link
        androidUrl = soup.find('meta', property = 'al:android:url')['content']
        assert androidUrl.startswith('fb://page/') and androidUrl.endswith('?referrer=app_link')
        kwargs['pageId'] = int(androidUrl[10:-18])

        return User(**kwargs)
class FacebookCommunityScraper(_FacebookUserAndCommunityScraper):
    '''Scraper for the community tab of a Facebook page.'''

    name = 'facebook-community'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Same as the page URL but pointing at the community tab
        self._baseUrl = f'https://www.facebook.com/{self._username}/community/'
class FacebookGroupScraper(_FacebookCommonScraper):
    '''Scraper for the posts of a Facebook group.'''

    name = 'facebook-group'

    def __init__(self, group, **kwargs):
        super().__init__(**kwargs)
        self._group = group

    def get_items(self):
        '''Yield the group's posts, first from the group page and then from the pagination endpoint.

        Stops silently (with a warning) if the group page returns 404; raises snscrape.base.ScraperException on other unexpected status codes or if the expected post containers are missing.
        '''

        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}

        pageletDataPattern = re.compile(r'"GroupEntstreamPagelet",\{.*?\}(?=,\{)')
        pageletDataPrefixLength = len('"GroupEntstreamPagelet",')
        spuriousForLoopPattern = re.compile(r'^for \(;;\);')
        containerIdMarkers = ('content:{pagelet_group_mall:{container_id:"', 'content:{group_mall_after_tti:{container_id:"')

        baseUrl = f'https://upload.facebook.com/groups/{self._group}/?sorting_setting=CHRONOLOGICAL'
        r = self._get(baseUrl, headers = headers)
        if r.status_code == 404:
            _logger.warning('Group does not exist')
            return
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

        if containerIdMarkers[0] not in r.text:
            raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')

        soup = bs4.BeautifulSoup(r.text, 'lxml')

        # Posts are inside an HTML comment in two code tags with IDs listed in JS...
        for marker in containerIdMarkers:
            idStart = r.text.index(marker) + len(marker)
            idEnd = r.text.index('"', idStart)
            codeContainer = soup.find('code', id = r.text[idStart : idEnd])
            if not codeContainer:
                raise snscrape.base.ScraperException('Code container not found')
            if type(codeContainer.string) is not bs4.element.Comment:
                raise snscrape.base.ScraperException('Code container does not contain a comment')
            codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
            yield from self._soup_to_items(codeSoup, baseUrl, 'group')

        # Pagination
        while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
            # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
            r = self._get(
                'https://upload.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
                params = {'data': data, '__a': 1},
                headers = headers,
            )
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            payload = json.loads(spuriousForLoopPattern.sub('', r.text))['payload']
            if payload == '':
                # End of pagination
                break
            yield from self._soup_to_items(bs4.BeautifulSoup(payload, 'lxml'), baseUrl, 'group')

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('group', type = snscrape.base.nonempty_string('group'), help = 'A group name or ID')

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.group)

View File

@@ -1,102 +0,0 @@
import datetime
import itertools
import json
import logging
import re
import snscrape.base
logger = logging.getLogger(__name__)
class GooglePlusUserScraper(snscrape.base.Scraper):
    '''Scraper yielding the post URLs of a Google Plus user.'''

    name = 'googleplus-user'

    def __init__(self, user, **kwargs):
        super().__init__(**kwargs)
        self._user = user

    def get_items(self):
        '''Yield a URLItem for every post of the user, first from the profile page and then from the paginated data endpoint.

        Stops after logging a message if the user does not exist, has no posts, a request returns an unexpected status code, or the expected data cannot be found in the page.
        '''

        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        logger.info('Retrieving initial data')
        r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
        if r.status_code == 404:
            logger.warning('User does not exist')
            return
        elif r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        # Global data; only needed for the session ID
        #TODO: Make this more robust somehow
        match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
        if not match:
            logger.error('Unable to find session ID')
            return
        sid = match.group('sid')

        # Page data
        # As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
        match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
        if not match:
            logger.error('Unable to extract data')
            return
        jsonData = match.group('data')
        response = json.loads(jsonData)
        if response[0][7] is None:
            logger.info('User has no posts')
            return
        for postObj in response[0][7]:
            yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
        cursor = response[0][1] # 'ADSJ_x'
        if cursor is None:
            # No further pages
            return
        baseDate = datetime.datetime.utcnow()
        baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
        userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]

        for counter in itertools.count(start = 2):
            logger.info('Retrieving next page')
            reqid = 1 + baseSeconds + int(1e5) * counter
            r = self._post(
                f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
                data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
                headers = headers
            )
            if r.status_code != 200:
                logger.error(f'Got status code {r.status_code}')
                return

            # As if everything up to here wasn't terrible already, this is where it gets *really* bad.
            # The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
            # The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
            # It sucks.
            # Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
            # I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
            # Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.

            garbage = r.text
            assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
            pos = 6
            while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
                pos += 1
            # A slice of a str is already a str, so it can be handed to the decoder directly.
            response = json.JSONDecoder().raw_decode(garbage[pos:])[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end

            for postObj in response[0][2]['74333095'][0][7]:
                yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')

            cursor = response[0][2]['74333095'][0][1]

            if cursor is None:
                break

    @classmethod
    def setup_parser(cls, subparser):
        subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')

    @classmethod
    def from_args(cls, args):
        return cls(args.user, retries = args.retries)

View File

@@ -1,74 +1,243 @@
__all__ = ['InstagramPost', 'User', 'InstagramUserScraper', 'InstagramHashtagScraper', 'InstagramLocationScraper']
import dataclasses
import datetime
import hashlib
import json
import logging
import re
import snscrape.base
import typing
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
class InstagramUserScraper(snscrape.base.Scraper):
@dataclasses.dataclass
class InstagramPost(snscrape.base.Item):
    '''A post on Instagram; its string representation is the post URL.'''

    url: str
    date: datetime.datetime
    # Caption text, if any
    content: typing.Optional[str]
    thumbnailUrl: str
    displayUrl: str
    # None if the owner's username is not available
    username: typing.Optional[str]
    likes: int
    comments: int
    commentsDisabled: bool
    isVideo: bool

    def __str__(self):
        return self.url
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''An Instagram account; its string representation is the profile URL built from the username.'''

    username: str
    name: typing.Optional[str]
    # Counts carry their own granularity
    followers: snscrape.base.IntWithGranularity
    following: snscrape.base.IntWithGranularity
    posts: snscrape.base.IntWithGranularity

    # Deprecated accessors for the granularities, which are now attributes of the counts themselves
    followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
    followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
    postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')

    def __str__(self):
        return f'https://www.instagram.com/{self.username}/'
class _InstagramCommonScraper(snscrape.base.Scraper):
def __init__(self, **kwargs):
    '''Set up the common scraper state; all keyword arguments are passed on to the parent class.'''
    super().__init__(**kwargs)
    self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Filled by _initial_page on first use
    self._initialPage = None
def _response_to_items(self, response):
    '''Yield an InstagramPost for each media edge found in the response under the scraper's container and edge keys.'''

    edges = response[self._responseContainer][self._edgeXToMedia]['edges']
    for edge in edges:
        media = edge['node']
        owner = media['owner']
        captionEdges = media['edge_media_to_caption']['edges']
        # Posts without a caption have an empty edge list
        caption = captionEdges[0]['node']['text'] if len(captionEdges) else None
        yield InstagramPost(
            url = f'https://www.instagram.com/p/{media["shortcode"]}/',
            date = datetime.datetime.fromtimestamp(media['taken_at_timestamp'], datetime.timezone.utc),
            content = caption,
            thumbnailUrl = media['thumbnail_src'],
            displayUrl = media['display_url'],
            username = owner['username'] if 'username' in owner else None,
            likes = media['edge_media_preview_like']['count'],
            comments = media['edge_media_to_comment']['count'],
            commentsDisabled = media['comments_disabled'],
            isVideo = media['is_video'],
        )
def _initial_page(self):
if self._initialPage is None:
_logger.info('Retrieving initial data')
r = self._get(self._initialUrl, headers = self._headers, responseOkCallback = self._check_initial_page_callback)
if r.status_code not in (200, 404):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
raise snscrape.base.ScraperException('Redirected to login page')
self._initialPage = r
return self._initialPage
def _check_initial_page_callback(self, r):
if r.status_code != 200:
return True, None
jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
try:
obj = json.loads(jsonData)
except json.JSONDecodeError:
return False, 'invalid JSON'
r._snscrape_json_obj = obj
return True, None
def _check_json_callback(self, r):
if r.status_code != 200:
return False, f'status code {r.status_code}'
if r.url.startswith('https://www.instagram.com/accounts/login/'):
raise snscrape.base.ScraperException('Redirected to login page')
try:
obj = json.loads(r.text)
except json.JSONDecodeError as e:
return False, f'invalid JSON ({e!r})'
r._snscrape_json_obj = obj
return True, None
def get_items(self):
    '''Yield the posts of the page: first those embedded in the initial page, then the ones from the paginated GraphQL endpoint.

    Returns early (after logging) if the page does not exist, reports zero posts, or
    exposes no edges, which is reported as a private account.
    Raises snscrape.base.ScraperException if a pagination request does not end with status 200.
    '''
    r = self._initial_page()
    if r.status_code == 404:
        _logger.warning('Page does not exist')
        return
    response = r._snscrape_json_obj
    rhxGis = response.get('rhx_gis', '')
    graphql = response['entry_data'][self._pageName][0]['graphql']
    container = graphql[self._responseContainer]
    media = container[self._edgeXToMedia]
    if media['count'] == 0:
        _logger.info('Page has no posts')
        return
    if not media['edges']:
        _logger.warning('Private account')
        return
    pageID = container[self._pageIDKey]
    yield from self._response_to_items(graphql)
    if not media['page_info']['has_next_page']:
        return
    endCursor = media['page_info']['end_cursor']

    headers = self._headers.copy()
    headers['X-Requested-With'] = 'XMLHttpRequest' # Constant, so set once rather than on every iteration
    while True:
        _logger.info(f'Retrieving endCursor = {endCursor!r}')
        # Pass the two placeholders explicitly instead of format(**locals()), so renaming a local cannot silently break the request.
        variables = self._variablesFormat.format(pageID = pageID, endCursor = endCursor)
        # The header value is the MD5 hex digest of '<rhx_gis>:<variables>'.
        headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
        r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        data = r._snscrape_json_obj['data']
        media = data[self._responseContainer][self._edgeXToMedia]
        if not media['edges']:
            return
        yield from self._response_to_items(data)
        if not media['page_info']['has_next_page']:
            return
        endCursor = media['page_info']['end_cursor']
class InstagramUserScraper(_InstagramCommonScraper):
    '''Scraper for an Instagram user's posts; also provides the user entity parsed from the profile page's og:description meta tag.'''

    name = 'instagram-user'

    def __init__(self, username, **kwargs):
        super().__init__(**kwargs)
        self._username = username
        # Configuration consumed by the shared get_items/_response_to_items logic of the base class.
        self._initialUrl = f'https://www.instagram.com/{username}/'
        self._pageName = 'ProfilePage'
        self._responseContainer = 'user'
        self._edgeXToMedia = 'edge_owner_to_timeline_media'
        self._pageIDKey = 'id'
        self._queryHash = 'f2405b236d85e8296cf30347c9f08c2a'
        self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'

    def _get_entity(self):
        '''Return a User built from the og:description meta tag of the profile page.

        Returns None if the initial page did not return status 200 or has no
        og:description tag. An og:description that does not match the expected
        pattern trips the assertion.
        '''
        r = self._initial_page()
        if r.status_code != 200:
            return
        ogMarker = '<meta property="og:description" content="'
        if ogMarker not in r.text:
            return
        ogDescriptionContentPos = r.text.index(ogMarker) + len(ogMarker)
        ogDescription = r.text[ogDescriptionContentPos : r.text.index('"', ogDescriptionContentPos)]

        # A count is written as e.g. '1.2m', '3.4k', '1,234' or '56'.
        numPattern = r'\d+(?:\.\d+)?m|\d+(?:\.\d+)?k|\d+,\d+|\d+'
        ogDescriptionPattern = re.compile('^(' + numPattern + ') Followers, (' + numPattern + ') Following, (' + numPattern + r') Posts - See Instagram photos and videos from (?:(.*?) \(@([a-z0-9_.]+)\)|@([a-z0-9_-]+))$')
        m = ogDescriptionPattern.match(ogDescription)
        assert m, 'unexpected og:description format'

        def parse_num(s):
            '''Return (value, granularity) for a count string; the granularity is the value of the last digit shown.'''
            for suffix, exponent in (('m', 6), ('k', 3)):
                if s.endswith(suffix):
                    digits = s[:-1].replace(',', '')
                    decimals = len(digits.split('.')[1]) if '.' in digits else 0
                    return int(float(digits) * 10 ** exponent), 10 ** (exponent - decimals)
            return int(s.replace(',', '')), 1

        followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
        following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
        posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
        return User(
            # Group 5 is the username when a display name precedes it, group 6 when the username stands alone.
            username = m.group(5) or m.group(6),
            name = m.group(4) or None,
            followers = followers,
            following = following,
            posts = posts,
        )

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'An Instagram username (no leading @)')

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.username)
class InstagramHashtagScraper(_InstagramCommonScraper):
    '''Scraper for the posts listed on an Instagram hashtag page.'''

    name = 'instagram-hashtag'

    def __init__(self, hashtag, **kwargs):
        super().__init__(**kwargs)
        # Initial page and the keys under which its embedded JSON holds the posts
        self._initialUrl = f'https://www.instagram.com/explore/tags/{hashtag}/'
        self._pageName = 'TagPage'
        self._responseContainer = 'hashtag'
        self._edgeXToMedia = 'edge_hashtag_to_media'
        self._pageIDKey = 'name'
        # Pagination query: its hash and the template for its variables
        self._queryHash = 'f92f56d47dc7a55b606908374b43a314'
        self._variablesFormat = '{{"tag_name":"{pageID}","first":50,"after":"{endCursor}"}}'

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument(
            'hashtag',
            type = snscrape.base.nonempty_string('hashtag'),
            help = 'An Instagram hashtag (no leading #)',
        )

    @classmethod
    def _cli_from_args(cls, args):
        hashtag = args.hashtag
        return cls._cli_construct(args, hashtag)
class InstagramLocationScraper(_InstagramCommonScraper):
    '''Scraper for the posts listed on an Instagram location page.'''

    name = 'instagram-location'

    def __init__(self, locationId, **kwargs):
        super().__init__(**kwargs)
        # Initial page and the keys under which its embedded JSON holds the posts
        self._initialUrl = f'https://www.instagram.com/explore/locations/{locationId}/'
        self._pageName = 'LocationsPage'
        self._responseContainer = 'location'
        self._edgeXToMedia = 'edge_location_to_media'
        self._pageIDKey = 'id'
        # Pagination query: its hash and the template for its variables
        self._queryHash = '1b84447a4d8b6d6d0426fefb34514485'
        self._variablesFormat = '{{"id":"{pageID}","first":50,"after":"{endCursor}"}}'

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument(
            'locationid',
            help = 'An Instagram location ID',
            type = int,
        )

    @classmethod
    def _cli_from_args(cls, args):
        locationId = args.locationid
        return cls._cli_construct(args, locationId)

View File

@@ -0,0 +1,340 @@
__all__ = ['Toot', 'Boost', 'Attachment', 'Poll', 'PollOption', 'User', 'CustomEmoji', 'MastodonProfileScraper', 'MastodonTootScraperMode', 'MastodonTootScraper']
import bs4
import dataclasses
import datetime
import enum
import json
import logging
import snscrape.base
import time
import typing
import urllib.parse
_logger = logging.getLogger(__name__)
@dataclasses.dataclass
class Toot(snscrape.base.Item):
    '''A single toot as extracted from a Mastodon HTML page.'''

    # Always-present fields
    url: str
    id: str
    user: 'User'
    date: datetime.datetime
    text: str
    # Optional fields; the scraper only sets the ones it finds in the page
    spoilerText: typing.Optional[str] = None
    attachments: typing.Optional[typing.List['Attachment']] = None
    links: typing.Optional[typing.List[str]] = None
    mentionedUsers: typing.Optional[typing.List['User']] = None
    hashtags: typing.Optional[typing.List[str]] = None
    poll: typing.Optional['Poll'] = None

    def __str__(self):
        # A toot is represented by its URL.
        return self.url
@dataclasses.dataclass
class Boost(snscrape.base.Item):
    '''A boost: the boosting user together with the boosted toot.'''

    user: 'User'
    toot: Toot

    def __str__(self):
        # Boosts don't have their own URLs
        boosted = self.toot
        return str(boosted)
@dataclasses.dataclass
class Attachment:
    '''A file attached to a toot.'''

    url: str # Absolute URL of the attachment
    name: str # Link text or file name, depending on where the scraper found the attachment
@dataclasses.dataclass
class Poll:
    '''A poll attached to a toot.'''

    id: str
    expirationDate: datetime.datetime
    multiple: bool
    options: typing.List['PollOption']
    votesCount: int
    # Available since version 3.0.0 (commit 3babf846)
    votersCount: typing.Optional[int] = None
@dataclasses.dataclass
class PollOption:
    '''One answer option of a Poll.'''

    title: str # Text of the option
    votesCount: int # Votes cast for this option
@dataclasses.dataclass
class User(snscrape.base.Entity):
    '''A Mastodon (or compatible) account.'''

    account: str # @username@domain.invalid
    displayName: typing.Optional[str] = None
    displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None
    avatarUrl: typing.Optional[str] = None
    _url: typing.Optional[str] = None

    @property
    def url(self):
        '''The profile URL: the explicitly stored one if set, otherwise one derived from the account string.'''
        if self._url:
            return self._url
        # '@username@domain' -> 'https://domain/@username'
        parts = self.account[1:].split('@')
        return 'https://' + '/@'.join(parts[::-1])

    def __str__(self):
        return self.url
@dataclasses.dataclass
class CustomEmoji:
    '''A custom emoji used in a display name.'''

    shortName: str
    url: str
    # Optional because old instances expose only a single image URL for an emoji;
    # the scraper constructs those emojis from shortName and url alone.
    staticUrl: typing.Optional[str] = None
class _MastodonCommonScraper(snscrape.base.Scraper):
    '''Common base of the Mastodon scrapers: throttled requests and conversion of status HTML into Toot and Boost items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
        # time.time() of the most recent request; 0 until the first one is made
        self._lastRequest = 0

    def _rate_limited_get(self, *args, **kwargs):
        '''Forward to self._get, sleeping first if the previous request was made less than 3 seconds ago.'''
        if (diff := time.time() - self._lastRequest) < 3:
            time.sleep(3 - diff)
        self._lastRequest = time.time()
        return self._get(*args, **kwargs)

    def _entries_to_items(self, entries, url):
        '''Convert status entry elements into items.

        entries is an iterable of parsed HTML elements, one per status; url is the URL of
        the page they were taken from and is used to resolve relative links.
        Yields one Toot per entry, wrapped in a Boost when the entry is marked as a boost.
        Entries that contain a 'load-more' link are skipped.
        '''
        for entry in entries:
            if entry.find('a', class_ = 'load-more'):
                continue
            tootKwargs = {}

            # Metadata container; its class name differs between versions and page types.
            info = entry.find('div', class_ = 'status__info')
            if not info: # Before 2.5.0 (commit bb71538b)
                info = entry.find('div', class_ = 'status__header')
            if not info: # Detailed status (i.e. toot page rather than timeline)?
                info = entry.find('div', class_ = 'detailed-status__meta')
            link = info.find('a', class_ = 'status__relative-time')
            if not link: # Detailed status?
                link = info.find('a', class_ = 'detailed-status__datetime')
            tootKwargs['url'] = link['href']
            tootKwargs['id'] = tootKwargs['url'].rsplit('/', 1)[1]
            tootKwargs['date'] = datetime.datetime.strptime(info.find('data', class_ = 'dt-published')['value'], '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo = datetime.timezone.utc)

            # Author
            userKwargs = {}
            userLink = info.find('a', class_ = 'status__display-name')
            if not userLink: # Detailed status?
                userLink = entry.find('a', class_ = 'detailed-status__display-name')
            userNameSpan = userLink.find('span', class_ = 'display-name')
            userKwargs['account'] = userNameSpan.find('span').text.strip()
            if userKwargs['account'].count('@') == 1: # Ancient versions don't include the instance for posts from accounts on the instance itself
                userKwargs['account'] = self._url_to_account(userLink['href'])
            userKwargs['_url'] = urllib.parse.urljoin(url, userLink['href'])
            userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userNameSpan.find('strong'), url)
            userKwargs['avatarUrl'] = urllib.parse.urljoin(url, userLink.find('img', class_ = 'u-photo')['src'])
            tootKwargs['user'] = User(**userKwargs)

            # Text; when a spoiler link is present, the summary and the body are stored separately.
            content = entry.find('div', class_ = 'status__content')
            if not content.find(class_ = 'status__content__spoiler-link'):
                tootKwargs['text'] = '\n\n'.join(p.text for p in content.find_all('p'))
            else:
                tootKwargs['text'] = content.find('span', class_ = 'p-summary').text
                tootKwargs['spoilerText'] = '\n\n'.join(p.text for p in content.find('div', class_ = 'e-content').find_all('p'))

            # Attachments; the markup depends on the Mastodon version.
            if (attachmentsDiv := entry.find('div', class_ = 'attachment-list')):
                tootKwargs['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, a['href']), name = a.text.strip())
                    for a in attachmentsDiv.find_all('a')
                ]
            elif (mediaGalleryDiv := entry.find('div', attrs = {'data-component': 'MediaGallery'})): # Before 2.7.0 (https://github.com/mastodon/mastodon/issues/6714)
                o = json.loads(mediaGalleryDiv['data-props'])
                tootKwargs['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, medium['url']), name = medium['url'].rsplit('/', 1)[-1].strip())
                    for medium in o['media']
                ]
            elif (attachmentsDiv := entry.find('div', class_ = 'status__attachments')): # Before 2.3.0 (commit 2bbf987a)
                tootKwargs['attachments'] = [
                    Attachment(url = urllib.parse.urljoin(url, a['href']), name = a['href'].rsplit('/', 1)[1])
                    for a in attachmentsDiv.find_all('a')
                ]

            # Classify the links in the content as user mentions, hashtags, or plain links.
            links = []
            mentionedUsers = []
            hashtags = []
            for a in content.find_all('a'):
                classes = a.get('class', [])
                if 'mention' in classes and 'u-url' in classes:
                    mentionUrl = urllib.parse.urljoin(url, a['href'])
                    mentionedUsers.append(User(account = self._url_to_account(mentionUrl), _url = mentionUrl))
                elif 'mention' in classes and 'hashtag' in classes:
                    hashtags.append(a.text.strip())
                else:
                    links.append(urllib.parse.urljoin(url, a['href']))
            if links:
                tootKwargs['links'] = links
            if mentionedUsers:
                tootKwargs['mentionedUsers'] = mentionedUsers
            if hashtags:
                tootKwargs['hashtags'] = hashtags

            # Poll
            if (pollDiv := entry.find('div', attrs = {'data-component': 'Poll'})):
                poll = json.loads(pollDiv['data-props'])['poll']
                pollKwargs = {
                    'id': poll['id'],
                    'expirationDate': datetime.datetime.strptime(poll['expires_at'], '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo = datetime.timezone.utc),
                    'multiple': poll['multiple'],
                    'options': [PollOption(title = op['title'], votesCount = op['votes_count']) for op in poll['options']],
                    'votesCount': poll['votes_count'],
                }
                if 'voters_count' in poll: # 3.0.0 (commit 3babf846)
                    pollKwargs['votersCount'] = poll['voters_count']
                tootKwargs['poll'] = Poll(**pollKwargs)

            toot = Toot(**tootKwargs)

            # Boosts
            prepend = entry.find('div', class_ = 'status__prepend')
            if not prepend: # Before 2.5.0 (commit bb71538b)
                prepend = entry.find('div', class_ = 'pre-header')
            if prepend and prepend.find('i', class_ = 'fa-retweet'): # Is a boost
                userKwargs = {}
                userLink = prepend.find('a', class_ = 'status__display-name')
                # The user is always on this instance since that's the only place where boosts are shown, hence there is no explicit account span. Reconstruct from URL.
                userUrl = urllib.parse.urljoin(url, userLink['href'])
                assert userUrl.count('/') == 3 and userUrl.count('/@') == 1
                userKwargs['account'] = '@'.join(reversed(userUrl.split('/')[2:]))
                userKwargs['displayName'], userKwargs['displayNameWithCustomEmojis'] = self._display_name(userLink.find('strong'), url)
                toot = Boost(user = User(**userKwargs), toot = toot)

            yield toot

    def _display_name(self, strong, url):
        '''Extract a display name from its <strong> element.

        Returns (plain, full): plain is the name with each emoji image replaced by its alt
        text; full is the list of text pieces and CustomEmoji objects, or None if the name
        contains no emoji image. url is used to resolve relative image URLs.
        '''
        outPlain = []
        outFull = []
        hasCustomEmoji = False
        for child in strong.children:
            if isinstance(child, bs4.element.NavigableString):
                outPlain.append(str(child))
                outFull.append(str(child))
            elif child.name == 'img' and 'custom-emoji' in child.get('class', []):
                hasCustomEmoji = True
                outPlain.append(child['alt'])
                outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['data-original']), staticUrl = urllib.parse.urljoin(url, child['data-static'])))
            elif child.name == 'img' and 'emojione' in child.get('class', []):
                # Version 2.0.0 (which first added custom emojis) to 2.9.4: no data-* attributes, only gets one of the URLs with no (easy, reliable) way of knowing which it is.
                hasCustomEmoji = True
                outPlain.append(child['alt'])
                # staticUrl has to be passed explicitly; omitting the argument raises a TypeError when the dataclass declares it without a default.
                outFull.append(CustomEmoji(shortName = child['alt'], url = urllib.parse.urljoin(url, child['src']), staticUrl = None))
            else:
                _logger.warning(f'Unexpected display name child: {child!r}')
        return ''.join(outPlain), outFull if hasCustomEmoji else None

    @staticmethod
    def _url_to_account(url):
        '''Turn a profile URL into an account string of the form '@user@domain'.

        Raises ValueError if the URL matches none of the recognised layouts.
        '''
        slashes = url.count('/')
        if slashes == 3 and url.count('/@') == 1:
            # https://domain/@user; the '@' of the path segment becomes the leading '@'
            return '@'.join(reversed(url.split('/')[2:]))
        # https://domain/<kind>/user: /users/ (e.g. Pleroma, also supported by Mastodon), /accounts/ (e.g. Peertube), /profile/ (e.g. Friendica)
        if slashes == 4 and any(marker in url for marker in ('/users/', '/accounts/', '/profile/')):
            return '@' + '@'.join(reversed(url.split('/')[2::2]))
        raise ValueError('Unrecognised account URL format')
class MastodonProfileScraper(_MastodonCommonScraper):
    '''Scraper for the toots shown on a Mastodon profile page.'''

    name = 'mastodon-profile'

    def __init__(self, account, **kwargs):
        '''account is either a profile URL or a string of the form @account@instance.example.org.'''
        super().__init__(**kwargs)
        if account.startswith('@') and account.count('@') == 2:
            account, domain = account[1:].split('@')
            url = f'https://{domain}/@{account}'
        else:
            url = account
        self._url = url

    def get_items(self):
        '''Yield the items of the profile page by page, following the pagination links until none is left.

        Raises snscrape.base.ScraperException on unexpected status codes. If both the
        with_replies page and the plain profile page return 404, a warning is logged and
        nothing is yielded.
        '''
        def is_max_id_link(href):
            # The filter is called with the attribute value, which is None for <a> elements without href.
            return href is not None and ('?max_id=' in href or '&max_id=' in href)

        url = None # URL of the next page; None until the first page has been requested
        while True:
            if url is None:
                r = self._rate_limited_get(f'{self._url}/with_replies', headers = self._headers)
                if r.status_code not in (200, 404):
                    raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
                if r.status_code == 404: # Possibly an old instance where with_replies doesn't exist, try without that.
                    r = self._rate_limited_get(self._url, headers = self._headers)
                    if r.status_code not in (200, 404):
                        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
                    if r.status_code == 404:
                        _logger.warning('Account does not exist')
                        return
                    _logger.warning('Old Mastodon instance, cannot retrieve reply toots')
            else:
                r = self._rate_limited_get(url, headers = self._headers)
                if r.status_code != 200:
                    raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            yield from self._entries_to_items(soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry'), r.url)
            nextA = soup.find('a', class_ = 'load-more', href = is_max_id_link)
            if not nextA: # Before 2.5.0 (commit bb71538b)
                paginationDiv = soup.find('div', class_ = 'pagination')
                if paginationDiv:
                    nextA = paginationDiv.find('a', class_ = 'next')
            if not nextA: # End of pagination
                break
            url = urllib.parse.urljoin(r.url, nextA['href'])

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('account', type = snscrape.base.nonempty_string('account'), help = 'A Mastodon account. This can be either a URL to the profile page or a string of the form @account@instance.example.org')

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.account)
class MastodonTootScraperMode(enum.Enum):
    '''What MastodonTootScraper collects: only the referenced toot, or the thread around it.'''

    SINGLE = 'single'
    THREAD = 'thread'

    @classmethod
    def _cli_from_args(cls, args):
        '''Pick the mode from parsed CLI arguments: THREAD if args.thread is truthy, SINGLE otherwise.'''
        return cls.THREAD if args.thread else cls.SINGLE
class MastodonTootScraper(_MastodonCommonScraper):
    '''Scraper for a single toot page, optionally including the thread shown around the toot.'''

    name = 'mastodon-toot'

    def __init__(self, url, *, mode = MastodonTootScraperMode.SINGLE, **kwargs):
        super().__init__(**kwargs)
        self._url = url
        self._mode = mode

    def get_items(self):
        '''Yield the toot at self._url (SINGLE mode) or every entry of the page's activity stream (THREAD mode).

        Logs a warning and yields nothing on a 404; raises snscrape.base.ScraperException
        on any other status code that is not 200.
        '''
        r = self._rate_limited_get(self._url, headers = self._headers)
        if r.status_code == 404:
            _logger.warning('Toot does not exist')
            return
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        if self._mode is MastodonTootScraperMode.SINGLE:
            # The entry is the element that directly contains the detailed status.
            entries = [soup.find('div', class_ = 'detailed-status').parent]
        elif self._mode is MastodonTootScraperMode.THREAD:
            entries = soup.find('div', class_ = 'activity-stream').find_all('div', class_ = 'entry')
        else:
            return
        yield from self._entries_to_items(entries, r.url)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('--thread', action = 'store_true', help = 'Collect thread around the toot referenced by the URL')
        subparser.add_argument('url', type = snscrape.base.nonempty_string('url'), help = 'A URL for a toot')

    @classmethod
    def _cli_from_args(cls, args):
        mode = MastodonTootScraperMode._cli_from_args(args)
        return cls._cli_construct(args, args.url, mode = mode)

285
snscrape/modules/reddit.py Normal file
View File

@@ -0,0 +1,285 @@
__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper']
import dataclasses
import datetime
import logging
import re
import snscrape.base
import snscrape.version
import string
import time
import typing
_logger = logging.getLogger(__name__)
# Most of these fields should never be None, but due to broken data, they sometimes are anyway...
@dataclasses.dataclass
class Submission(snscrape.base.Item):
    '''A Reddit submission as returned by the Pushshift API.'''

    author: typing.Optional[str] # E.g. submission hf7k6
    date: datetime.datetime
    id: str
    link: typing.Optional[str]
    selftext: typing.Optional[str]
    subreddit: typing.Optional[str] # E.g. submission 617p51
    title: str
    url: str

    # Deprecated alias of date
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        # A submission is represented by its URL.
        return self.url
@dataclasses.dataclass
class Comment(snscrape.base.Item):
    '''A Reddit comment as returned by the Pushshift API.'''

    author: typing.Optional[str]
    body: str
    date: datetime.datetime
    id: str
    parentId: typing.Optional[str]
    subreddit: typing.Optional[str]
    url: str

    # Deprecated alias of date
    created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')

    def __str__(self):
        # A comment is represented by its URL.
        return self.url
def _cmp_id(id1, id2):
'''Compare two Reddit IDs. Returns -1 if id1 is less than id2, 0 if they are equal, and 1 if id1 is greater than id2.
id1 and id2 may have prefixes like t1_, but if included, they must be present on both and equal.'''
if id1.startswith('t') and '_' in id1:
prefix, id1 = id1.split('_', 1)
if not id2.startswith(f'{prefix}_'):
raise ValueError('id2 must have the same prefix as id1')
_, id2 = id2.split('_', 1)
if id1.strip(string.ascii_lowercase + string.digits) != '':
raise ValueError('invalid characters in id1')
if id2.strip(string.ascii_lowercase + string.digits) != '':
raise ValueError('invalid characters in id2')
if len(id1) < len(id2):
return -1
if len(id1) > len(id2):
return 1
if id1 < id2:
return -1
if id1 > id2:
return 1
return 0
class _RedditPushshiftScraper(snscrape.base.Scraper):
    '''Base class of the Reddit scrapers: Pushshift API access and conversion of API objects into items.'''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}

    def _handle_rate_limiting(self, r):
        '''Response callback returning (ok, message); a 429 response additionally sleeps for 10 seconds before being reported as not ok.'''
        if r.status_code == 200:
            return True, None
        if r.status_code != 429:
            return False, 'non-200 status code'
        _logger.info('Got 429 response, sleeping')
        time.sleep(10)
        return False, 'rate-limited'

    def _get_api(self, url, params = None):
        '''Request an API URL and return the decoded JSON body; raises snscrape.base.ScraperException if the final status is not 200.'''
        response = self._get(url, params = params, headers = self._headers, responseOkCallback = self._handle_rate_limiting)
        if response.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
        return response.json()

    def _api_obj_to_item(self, d):
        '''Convert an API object (dict) into a Submission if it has a 'title' key and into a Comment otherwise.'''
        isSubmission = 'title' in d

        # Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all
        permalink = d.get('permalink')
        if permalink is None:
            # E.g. comment dovj2v7
            permalink = d.get('permalink_url')
        if permalink is None:
            if 'link_id' in d and d['link_id'].startswith('t3_'): # E.g. comment doraazf
                if 'subreddit' in d:
                    permalink = f'/r/{d["subreddit"]}/comments/{d["link_id"][3:]}/_/{d["id"]}/'
                else: # E.g. submission 617p51 but can likely happen for comments as well
                    permalink = f'/comments/{d["link_id"][3:]}/_/{d["id"]}/'
            else:
                _logger.warning('Unable to find or construct permalink')
                permalink = '/'

        # Fields shared by both item types
        author = d.get('author')
        date = datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc)
        url = f'https://old.reddit.com{permalink}'
        subreddit = d.get('subreddit')

        if not isSubmission:
            return Comment(
                author = author,
                date = date,
                url = url,
                subreddit = subreddit,
                body = d['body'],
                parentId = d.get('parent_id'),
                id = f't1_{d["id"]}',
            )

        selftext = d.get('selftext') or None
        link = None
        if not selftext:
            # Only submissions without self text have a link; relative ones are made absolute.
            link = d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}'
            # A link that just points back at the submission itself carries no information.
            if link == url or url.replace('//old.reddit.com/', '//www.reddit.com/') == link:
                link = None
        return Submission(
            author = author,
            date = date,
            url = url,
            subreddit = subreddit,
            selftext = selftext,
            link = link,
            title = d['title'],
            id = f't3_{d["id"]}',
        )
class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
    '''Base class of the scrapers that list submissions and/or comments matching one API field; subclasses provide name, _validationFunc and _apiField.'''

    def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
        super().__init__(**kwargs)
        self._name = name
        self._submissions = submissions
        self._comments = comments
        self._before = before
        self._after = after
        scraperClass = type(self)
        if not scraperClass._validationFunc(self._name):
            raise ValueError(f'invalid {type(self).name.split("-", 1)[1]} name')
        if not (self._submissions or self._comments):
            raise ValueError('At least one of submissions and comments must be True')

    def _iter_api(self, url, params = None):
        '''Iterate through the Pushshift API using the 'before' parameter and yield the items.'''
        lowestIdSeen = None
        if params is None:
            params = {}
        if self._before is not None:
            params['before'] = self._before
        if self._after is not None:
            params['after'] = self._after
        params['sort'] = 'desc'
        while True:
            batch = self._get_api(url, params = params)['data']
            if not batch:
                break
            # end of pagination: the page contains nothing below the lowest ID already yielded
            if lowestIdSeen is not None and not any(_cmp_id(d['id'], lowestIdSeen) < 0 for d in batch):
                break
            for d in batch:
                if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) < 0:
                    yield self._api_obj_to_item(d)
                    lowestIdSeen = d['id']
            # Continue from the timestamp of the last object; the + 1 means objects sharing that timestamp are requested again.
            params['before'] = batch[-1]["created_utc"] + 1

    def _iter_api_submissions_and_comments(self, params: dict):
        '''Yield submissions and comments interleaved in reverse-chronological order; either kind may be disabled.'''
        # Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
        params['size'] = '1000'
        # Pass copies to prevent the two iterators from messing each other up by using the same dict
        submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) if self._submissions else iter(())
        commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy()) if self._comments else iter(())

        exhausted = object() # Sentinel returned by next() once an iterator has run dry

        tipSubmission = next(submissionsIter, exhausted)
        if tipSubmission is exhausted:
            # There are no submissions, just yield comments and return
            yield from commentsIter
            return
        tipComment = next(commentsIter, exhausted)
        if tipComment is exhausted:
            # There are no comments, just yield submissions and return
            yield tipSubmission
            yield from submissionsIter
            return

        while True:
            # Return newer first; if both have the same creation datetime, return the comment first
            if tipSubmission.date > tipComment.date:
                yield tipSubmission
                tipSubmission = next(submissionsIter, exhausted)
                if tipSubmission is exhausted:
                    # Reached the end of submissions, just yield the remaining comments and stop
                    yield tipComment
                    yield from commentsIter
                    return
            else:
                yield tipComment
                tipComment = next(commentsIter, exhausted)
                if tipComment is exhausted:
                    # Reached the end of comments, just yield the remaining submissions and stop
                    yield tipSubmission
                    yield from submissionsIter
                    return

    def get_items(self):
        apiField = type(self)._apiField
        yield from self._iter_api_submissions_and_comments({apiField: self._name})

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('--no-submissions', dest = 'noSubmissions', action = 'store_true', default = False, help = 'Don\'t list submissions')
        subparser.add_argument('--no-comments', dest = 'noComments', action = 'store_true', default = False, help = 'Don\'t list comments')
        subparser.add_argument('--before', metavar = 'TIMESTAMP', type = int, help = 'Fetch results before a Unix timestamp')
        subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp')
        # The positional argument is named after the part of the scraper name following 'reddit-'.
        name = cls.name.split('-', 1)[1]
        subparser.add_argument(name, type = snscrape.base.nonempty_string(name))

    @classmethod
    def _cli_from_args(cls, args):
        name = cls.name.split('-', 1)[1]
        return cls._cli_construct(
            args,
            getattr(args, name),
            submissions = not args.noSubmissions,
            comments = not args.noComments,
            before = args.before,
            after = args.after,
        )
class RedditUserScraper(_RedditPushshiftSearchScraper):
    '''Lists the submissions and comments of a Reddit user (API field 'author').'''

    name = 'reddit-user'
    _apiField = 'author'
    # 3 to 20 characters: ASCII letters, digits, underscore, hyphen
    _validationFunc = lambda x: re.match(r'^[A-Za-z0-9_-]{3,20}$', x)
class RedditSubredditScraper(_RedditPushshiftSearchScraper):
    '''Lists the submissions and comments of a subreddit (API field 'subreddit').'''

    name = 'reddit-subreddit'
    _apiField = 'subreddit'
    # 3 to 21 characters: ASCII letters, digits, underscore; the first character must not be an underscore
    _validationFunc = lambda x: re.match(r'^[A-Za-z0-9][A-Za-z0-9_]{2,20}$', x)
class RedditSearchScraper(_RedditPushshiftSearchScraper):
    '''Lists the submissions and comments matching a search query (API field 'q').'''

    name = 'reddit-search'
    _apiField = 'q'
    # Any query is accepted.
    _validationFunc = lambda x: True
class RedditSubmissionScraper(_RedditPushshiftScraper):
    '''Scraper for one submission and its comments.'''

    name = 'reddit-submission'

    def __init__(self, submissionId, **kwargs):
        '''submissionId may carry the t3_ prefix; apart from that it must consist of lowercase ASCII letters and digits, else ValueError is raised.'''
        bareId = submissionId[3:] if submissionId.startswith('t3_') else submissionId
        if bareId.strip(string.ascii_lowercase + string.digits) != '':
            raise ValueError('invalid submissionId')
        super().__init__(**kwargs)
        self._submissionId = submissionId

    def get_items(self):
        '''Yield the submission followed by its comments, which are requested in batches of up to 500 IDs.

        Yields nothing if the submission is not found; raises snscrape.base.ScraperException
        if the lookup returns more than one result.
        '''
        found = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')['data']
        if not found:
            return
        if len(found) != 1:
            raise snscrape.base.ScraperException(f'Got {len(found)} results instead of 1')
        yield self._api_obj_to_item(found[0])

        commentIds = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')['data']
        if not commentIds:
            return
        batchSize = 500
        for start in range(0, len(commentIds), batchSize):
            ids = commentIds[start : start + batchSize]
            obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}')
            for d in obj['data']:
                yield self._api_obj_to_item(d)

    @classmethod
    def _cli_setup_parser(cls, subparser):
        subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId'))

    @classmethod
    def _cli_from_args(cls, args):
        return cls._cli_construct(args, args.submissionId)

View File

@@ -0,0 +1,339 @@
__all__ = ['LinkPreview', 'TelegramPost', 'Channel', 'TelegramChannelScraper']
import bs4
import dataclasses
import datetime
import logging
import re
import snscrape.base
import typing
import urllib.parse
_logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
@dataclasses.dataclass
class LinkPreview:
    '''The preview of a link attached to a Telegram post.'''

    href: str # The only required field
    siteName: typing.Optional[str] = None
    title: typing.Optional[str] = None
    description: typing.Optional[str] = None
    image: typing.Optional[str] = None
@dataclasses.dataclass
class Channel(snscrape.base.Entity):
    '''A Telegram channel; everything except the username is optional.'''

    username: str
    title: typing.Optional[str] = None
    verified: typing.Optional[bool] = None
    photo: typing.Optional[str] = None
    description: typing.Optional[str] = None
    members: typing.Optional[int] = None
    # Counters carrying their own granularity
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    videos: typing.Optional[snscrape.base.IntWithGranularity] = None
    links: typing.Optional[snscrape.base.IntWithGranularity] = None
    files: typing.Optional[snscrape.base.IntWithGranularity] = None

    # Deprecated aliases of the granularity attribute of the counters above
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
    linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
    filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')

    def __str__(self):
        # A channel is represented by the URL of its t.me/s/ page.
        return f'https://t.me/s/{self.username}'
@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
    """A single post scraped from a Telegram channel's public preview page.

    ``url``, ``date`` and ``content`` are always passed by the scraper, but
    ``content`` is ``None`` for posts without a text block. The list-valued
    fields are ``None`` (not an empty list) when nothing was found.
    """

    url: str
    date: datetime.datetime
    content: typing.Optional[str]
    outlinks: typing.Optional[typing.List[str]] = None
    mentions: typing.Optional[typing.List[str]] = None
    hashtags: typing.Optional[typing.List[str]] = None
    forwarded: typing.Optional['Channel'] = None  # Channel the post was forwarded from
    forwardedUrl: typing.Optional[str] = None  # href of the "forwarded from" link
    media: typing.Optional[typing.List['Medium']] = None
    views: typing.Optional[snscrape.base.IntWithGranularity] = None
    linkPreview: typing.Optional[LinkPreview] = None

    # Deprecated space-joined form of ``outlinks``. ``outlinks`` may be None, so fall back
    # to an empty list instead of letting str.join raise a TypeError.
    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks or []), 'outlinks')

    def __str__(self):
        return self.url
class Medium:
    """Common base class of the media attachment types stored in ``TelegramPost.media``."""
    pass
@dataclasses.dataclass
class Photo(Medium):
    """A photo attachment, identified by its image URL."""

    url: str  # Image URL extracted from the link's inline style
@dataclasses.dataclass
class Video(Medium):
    """A video attachment; created when the video player contains a ``<time>`` tag."""

    thumbnailUrl: typing.Optional[str]  # None when the player has no <i> thumbnail tag
    duration: float  # Seconds, as computed by _durationStrToSeconds
    url: typing.Optional[str] = None  # None when the player has no <video> tag
@dataclasses.dataclass
class VoiceMessage(Medium):
    """A voice message attachment."""

    url: str  # src of the player's <audio> tag
    duration: int  # Seconds; the scraper passes the result of _durationStrToSeconds
    bars:typing.List[float]  # Waveform bar heights parsed from the bars' inline styles
@dataclasses.dataclass
class Gif(Medium):
    """A video-player attachment without a ``<time>`` tag, which the scraper treats as a GIF."""

    thumbnailUrl: typing.Optional[str]  # None when the player has no <i> thumbnail tag
    url: typing.Optional[str] = None  # None when the player has no <video> tag
class TelegramChannelScraper(snscrape.base.Scraper):
name = 'telegram-channel'
def __init__(self, name, **kwargs):
    """Prepare a scraper for the channel *name*; other keyword arguments go to the base class."""
    super().__init__(**kwargs)
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    self._name = name
    self._headers = {'User-Agent': userAgent}
    # Cache for the first /s/ page and its parsed soup; filled by _initial_page
    self._initialPage = self._initialPageSoup = None
def _initial_page(self):
if self._initialPage is None:
r = self._get(f'https://t.me/s/{self._name}', headers = self._headers)
if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.text, 'lxml')
return self._initialPage, self._initialPageSoup
def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
    """Yield a TelegramPost for every post container in *soup*, in reverse document order.

    ``pageUrl`` is the base URL against which link hrefs are resolved. When
    ``onlyUsername`` is true, only the part of the first processed post's
    ``data-post`` attribute before the slash is yielded, and the generator stops.
    """
    posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
    for post in reversed(posts):
        if onlyUsername:
            yield post['data-post'].split('/')[0]
            return
        dateDiv = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date')
        rawUrl = dateDiv['href']
        # Expected shape: https://t.me/<channel>/<digits>; anything else is only logged, not rejected
        if not rawUrl.startswith('https://t.me/') or sum(x == '/' for x in rawUrl) != 4 or rawUrl.rsplit('/', 1)[1].strip('0123456789') != '':
            _logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
        url = rawUrl.replace('//t.me/', '//t.me/s/')
        # Drop the two date hyphens and all colons so the timestamp fits the compact strptime format below
        date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
        media = []
        outlinks = []
        mentions = []
        hashtags = []
        forwarded = None
        forwardedUrl = None
        if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
            forwardedUrl = forwardTag['href']
            # The channel name is the first path segment after 't.me/'
            forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
            forwarded = Channel(username = forwardedName)
        if (message := post.find('div', class_ = 'tgme_widget_message_text')):
            content = message.get_text(separator="\n")
        else:
            content = None
        for link in post.find_all('a'):
            if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
                # Author links at the top (avatar and name)
                continue
            if link['href'] == rawUrl or link['href'] == url:
                style = link.attrs.get('style', '')
                # Generic filter of links to the post itself, catches videos, photos, and the date link
                if style != '':
                    imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                    if len(imageUrls) == 1:
                        media.append(Photo(url = imageUrls[0]))
                    # NOTE(review): nesting of this `continue` was reconstructed; unstyled self-links
                    # (e.g. the date link) fall through and are excluded by the rawUrl check below — confirm
                    continue
            if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
                style = link.attrs.get('style', '')
                imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
                if len(imageUrls) == 1:
                    media.append(Photo(url = imageUrls[0]))
                # resp = self._get(image[0])
                # encoded_string = base64.b64encode(resp.content)
                # Individual photo or video link
                continue
            if link.text.startswith('@'):
                mentions.append(link.text.strip('@'))
                continue
            if link.text.startswith('#'):
                hashtags.append(link.text.strip('#'))
                continue
            href = urllib.parse.urljoin(pageUrl, link['href'])
            # Keep each outlink once, and never the post's own URL or the forward source
            if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
                outlinks.append(href)
        for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
            audioUrl = voicePlayer.find('audio')['src']
            durationStr = voicePlayer.find('time').text
            duration = _durationStrToSeconds(durationStr)
            # Each <s> inside the 'bar' div carries its height as the last 'name:value' pair of its style
            barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
            media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
        for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
            iTag = videoPlayer.find('i')
            if iTag is None:
                videoUrl = None
                videoThumbnailUrl = None
            else:
                style = iTag['style']
                videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
                videoTag = videoPlayer.find('video')
                videoUrl = None if videoTag is None else videoTag['src']
            mKwargs = {
                'thumbnailUrl': videoThumbnailUrl,
                'url': videoUrl,
            }
            timeTag = videoPlayer.find('time')
            # A player without a <time> tag is treated as a GIF, otherwise as a video with a duration
            if timeTag is None:
                cls = Gif
            else:
                cls = Video
                durationStr = videoPlayer.find('time').text
                mKwargs['duration'] = _durationStrToSeconds(durationStr)
            media.append(cls(**mKwargs))
        linkPreview = None
        if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
            kwargs = {}
            kwargs['href'] = urllib.parse.urljoin(pageUrl, linkPreviewA['href'])
            if (siteNameDiv := linkPreviewA.find('div', class_ = 'link_preview_site_name')):
                kwargs['siteName'] = siteNameDiv.text
            if (titleDiv := linkPreviewA.find('div', class_ = 'link_preview_title')):
                kwargs['title'] = titleDiv.text
            if (descriptionDiv := linkPreviewA.find('div', class_ = 'link_preview_description')):
                kwargs['description'] = descriptionDiv.text
            if (imageI := linkPreviewA.find('i', class_ = 'link_preview_image')):
                if imageI['style'].startswith("background-image:url('"):
                    # 22 == len("background-image:url('"); take everything up to the closing quote
                    kwargs['image'] = imageI['style'][22 : imageI['style'].index("'", 22)]
                else:
                    _logger.warning(f'Could not process link preview image on {url}')
            linkPreview = LinkPreview(**kwargs)
            # The preview target is reported via linkPreview, so drop it from the outlinks
            if kwargs['href'] in outlinks:
                outlinks.remove(kwargs['href'])
        viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
        views = None if viewsSpan is None else _parse_num(viewsSpan.text)
        # Empty collections are reported as None
        outlinks = outlinks if outlinks else None
        media = media if media else None
        mentions = mentions if mentions else None
        hashtags = hashtags if hashtags else None
        yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
def get_items(self):
    """Yield the channel's posts page by page.

    Pagination follows the page's "tme_messages_more" link. When that link is
    missing, the next page URL is derived by subtracting 20 from the numeric
    index of the previous page URL (or of the page's canonical link).
    Scraping ends when post number 1 is the first post on a page, when no
    further page can be derived, or when the derived index is 20 or less.

    Raises ``snscrape.base.ScraperException`` on a non-200 response.
    """
    r, soup = self._initial_page()
    if '/s/' not in r.url:
        _logger.warning('No public post list for this user')
        return
    nextPageUrl = ''
    while True:
        yield from self._soup_to_items(soup, r.url)
        # If message 1 is the first message in the page, terminate scraping.
        # find() returns None on pages without a date link, so check explicitly
        # instead of hiding the resulting TypeError behind a bare except.
        firstDateLink = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
        if firstDateLink is not None and firstDateLink['href'].split('/')[-1] == '1':
            break
        pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
        if not pageLink:
            # Some pages are missing a "tme_messages_more" tag, causing early termination
            if '=' not in nextPageUrl:
                canonicalLink = soup.find('link', attrs = {'rel': 'canonical'}, href = True)
                if canonicalLink is None:
                    _logger.warning('Could not find a next page link or a canonical link, stopping')
                    break
                nextPageUrl = canonicalLink['href']
            try:
                nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
            except ValueError:
                # The URL carries no numeric index, so no next page URL can be constructed
                _logger.warning(f'Could not derive a next page from {nextPageUrl!r}, stopping')
                break
            if nextPostIndex > 20:
                pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
            else:
                break
        nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
        r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
        if r.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
        soup = bs4.BeautifulSoup(r.text, 'lxml')
def _get_entity(self):
    """Collect channel metadata and return it as a ``Channel``.

    The member count and profile photo come from ``https://t.me/<name>``; title,
    verification flag, canonical username, description and media counters come
    from the ``/s/`` preview page. Returns ``None`` when the preview page
    redirects away from ``/s/`` (channel without public posts).

    Raises ``snscrape.base.ScraperException`` on a non-200 response.
    """
    kwargs = {}
    # /channel has a more accurate member count and bigger profile picture
    r = self._get(f'https://t.me/{self._name}', headers = self._headers)
    if r.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    membersDiv = soup.find('div', class_ = 'tgme_page_extra')
    # find() returns None when the page has no 'tgme_page_extra' div; leave 'members' unset then
    if membersDiv is not None:
        membersText = membersDiv.text.split(',')[0]
        if membersText.endswith((' members', ' subscribers')):
            # Drop the trailing word and join the digit groups, e.g. '12 345 members' -> '12345'
            membersStr = ''.join(membersText.split(' ')[:-1])
            kwargs['members'] = 0 if membersStr == 'no' else int(membersStr)
    photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
    kwargs['photo'] = photoImg.attrs['src'] if photoImg is not None else None
    r, soup = self._initial_page()
    if '/s/' not in r.url: # Redirect on channels without public posts
        return
    channelInfoDiv = soup.find('div', class_ = 'tgme_channel_info')
    assert channelInfoDiv, 'channel info div not found'
    titleDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_title')
    kwargs['title'] = titleDiv.find('span').text
    kwargs['verified'] = bool(titleDiv.find('i', class_ = 'verified-icon'))
    # The username in the channel info is not canonicalised, nor is the one on the /channel page anywhere.
    # However, the post URLs are, so extract the first post and use that.
    try:
        kwargs['username'] = next(self._soup_to_items(soup, r.url, onlyUsername = True))
    except StopIteration:
        # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
        _logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
        kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
    if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
        kwargs['description'] = descriptionDiv.text
    for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
        value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
        type_ = div.find('span', class_ = 'counter_type').text
        if type_ == 'members':
            # Already extracted more accurately from /channel, skip
            continue
        elif type_ in ('photos', 'videos', 'links', 'files'):
            kwargs[type_] = snscrape.base.IntWithGranularity(value, granularity)
    return Channel(**kwargs)
@classmethod
def _cli_setup_parser(cls, subparser):
    """Register the positional ``channel`` argument on *subparser*."""
    channelType = snscrape.base.nonempty_string('channel')
    subparser.add_argument('channel', type = channelType, help = 'A channel name')
@classmethod
def _cli_from_args(cls, args):
    """Delegate to ``cls._cli_construct`` with the parsed ``channel`` name."""
    channel = args.channel
    return cls._cli_construct(args, channel)
def _parse_num(s):
s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
elif s.endswith('K'):
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
return int(s), 1
def _durationStrToSeconds(durationStr):
durationList = durationStr.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(durationList))])
def _telegramResponseOkCallback(r):
if r.status_code == 200:
return (True, None)
return (False, f'{r.status_code=}')

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,406 @@
__all__ = ['VKontaktePost', 'Photo', 'PhotoVariant', 'Video', 'User', 'VKontakteUserScraper']
import bs4
import collections
import dataclasses
import datetime
import itertools
import json
import logging
import re
import snscrape.base
import typing
import urllib.parse
# Pick a timezone backend: the standard library's zoneinfo if importable, pytz otherwise.
# Either way, _timezone(name) returns a zone object and _localised_datetime(tz, ...) builds
# an aware datetime from the usual datetime.datetime constructor arguments.
try:
    import zoneinfo
except ImportError:
    # Python 3.8 support; nowadays, Europe/Moscow is always UTC+3, but it's more complicated before 2014, so need proper zone info
    import pytz
    def _timezone(s):
        # pytz variant: look up the zone named s
        return pytz.timezone(s)
    def _localised_datetime(tz, *args, **kwargs):
        # pytz variant: build a naive datetime and let the zone attach itself via localize()
        return tz.localize(datetime.datetime(*args, **kwargs))
else:
    def _timezone(s):
        # zoneinfo variant: look up the zone named s
        return zoneinfo.ZoneInfo(s)
    def _localised_datetime(tz, *args, **kwargs):
        # zoneinfo variant: the zone is passed directly as tzinfo
        return datetime.datetime(*args, tzinfo = tz, **kwargs)
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
# English month abbreviations; a month's number is its index here plus one.
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Matches date strings of the forms
#   'today at H:MM am|pm', 'yesterday at H:MM am|pm',
#   'D Mon [YYYY] at H:MM am|pm' (groups day1/month1/year1) and
#   'Mon D, YYYY at H:MM am|pm' (groups month2/day2/year2).
# The 'date' group spans the whole part before ' at '.
_datePattern = re.compile(r'^(?P<date>today'
                          r'|yesterday'
                          r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
                          r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
                          ')'
                          r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
@dataclasses.dataclass
class User(snscrape.base.Entity):
    """A VK user, group or public page.

    ``username``, ``name`` and ``verified`` are required. The scraper adds the
    optional fields only when the corresponding element exists on the profile page.
    """

    username: str
    name: str
    verified: bool
    description: typing.Optional[str] = None
    websites: typing.Optional[typing.List[str]] = None
    followers: typing.Optional[snscrape.base.IntWithGranularity] = None
    posts: typing.Optional[snscrape.base.IntWithGranularity] = None
    photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    tags: typing.Optional[snscrape.base.IntWithGranularity] = None
    following: typing.Optional[snscrape.base.IntWithGranularity] = None
    # Deprecated aliases for the granularity of the counters above.
    # NOTE(review): these lambdas dereference the counter, so they fail when the counter is None; _DeprecatedProperty itself is defined in snscrape.base and not visible here.
    followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
    postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
    photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
    tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
    followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
    def __str__(self):
        # The profile URL
        return f'https://vk.com/{self.username}'
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
    """A post on a VK wall, optionally quoting another post."""

    url: str
    # datetime when a time of day is known, date for date-only strings, None when unparseable
    date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
    content: typing.Optional[str]  # None when the post has no text block
    user: User  # Author as shown in the post header
    outlinks: typing.Optional[typing.List[str]] = None
    photos: typing.Optional[typing.List['Photo']] = None
    video: typing.Optional['Video'] = None
    quotedPost: typing.Optional['VKontaktePost'] = None  # The post quoted inside this one, if any
    def __str__(self):
        return self.url
@dataclasses.dataclass
class Photo:
    """A photo attached to a VK post, available in one or more size variants."""

    variants: typing.List['PhotoVariant']
    url: typing.Optional[str] = None  # https://vk.com/photo... page URL, when the thumb links to one
@dataclasses.dataclass
class PhotoVariant:
    """One size variant of a photo: image URL plus its dimensions."""

    url: str
    width: int
    height: int
@dataclasses.dataclass
class Video:
    """A video attached to a VK post, built from the thumb link's data attributes."""

    id: str  # Value of the 'data-video' attribute
    list: str  # Value of the 'data-list' attribute
    duration: int  # Integer value of the 'data-duration' attribute
    url: str  # https://vk.com followed by the link's href
    thumbUrl: str  # URL taken from the link's background-image style
class VKontakteUserScraper(snscrape.base.Scraper):
name = 'vkontakte-user'
def __init__(self, username, **kwargs):
    """Prepare a scraper for the VK profile *username*; other keyword arguments go to the base class."""
    super().__init__(**kwargs)
    self._username = username
    self._baseUrl = f'https://vk.com/{self._username}'
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # Cache for the profile page and its parsed soup; filled by _initial_page
    self._initialPage = self._initialPageSoup = None
def _away_a_to_url(self, a):
# Transform an <a> tag with an href of /away.php?to=... to a plain URL; returns None if a doesn't have that form.
if a and a.get('href', '').startswith('/away.php?to='):
end = a['href'].find('&', 13)
if end == -1:
end = None
return urllib.parse.unquote(a['href'][13 : end])
return None
def _date_span_to_date(self, dateSpan):
if not dateSpan:
return None
if 'time' in dateSpan.attrs:
return datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc)
if (match := _datePattern.match(dateSpan.text)):
# Datetime information down to minutes
tz = _timezone('Europe/Moscow')
if match.group('date') in ('today', 'yesterday'):
date = datetime.datetime.now(tz = tz)
if match.group('date') == 'yesterday':
date -= datetime.timedelta(days = 1)
year, month, day = date.year, date.month, date.day
else:
year = int(match.group('year1') or match.group('year2') or datetime.datetime.now(tz = tz).year)
month = _months.index(match.group('month1') or match.group('month2')) + 1
day = int(match.group('day1') or match.group('day2'))
hour = int(match.group('hour'))
# Damn AM/PM...
if hour == 12:
hour -= 12
if match.group('ampm') == 'pm':
hour += 12
minute = int(match.group('minute'))
return _localised_datetime(tz, year, month, day, hour, minute)
if (match := re.match(r'^(?P<day>\d+)\s+(?P<month>' + '|'.join(_months) + r')\s+(?P<year>\d{4})$', dateSpan.text)):
# Date only
return datetime.date(int(match.group('year')), _months.index(match.group('month')) + 1, int(match.group('day')))
if dateSpan.text not in ('video', 'photo'): # Silently ignore video and photo reposts which have no original date attached
_logger.warning(f'Could not parse date string: {dateSpan.text!r}')
def _post_div_to_item(self, post, isCopy = False):
    """Convert one post element into a VKontaktePost.

    ``isCopy`` selects the markup of a quoted post (``copy_*`` classes) and is
    used for the recursive call on a post's ``copy_quote`` div. Returns ``None``
    (after logging) when the element has no post link.
    """
    postLink = post.find('a', class_ = 'post_link' if not isCopy else 'published_by_date')
    if not postLink:
        _logger.warning(f'Skipping post without link: {str(post)[:200]!r}')
        return
    url = urllib.parse.urljoin(self._baseUrl, postLink['href'])
    # Sanity check: a wall URL (or, for quotes, a video/photo URL) ending in '_<digits>' with an optional '?reply=<digits>'
    assert (url.startswith('https://vk.com/wall') or (isCopy and (url.startswith('https://vk.com/video') or url.startswith('https://vk.com/photo')))) and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') in ('', '?reply=')
    if not isCopy:
        dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
    else:
        dateSpan = post.find('div', class_ = 'copy_post_date').find('a', class_ = 'published_by_date')
    textDiv = post.find('div', class_ = 'wall_post_text')
    # Outlinks are the /away.php redirect links in the text, plus the media link's target if not already present
    outlinks = [h for a in textDiv.find_all('a') if (h := self._away_a_to_url(a))] if textDiv else []
    if (mediaLinkDiv := post.find('div', class_ = 'media_link')) and \
       (mediaLinkA := mediaLinkDiv.find('a', class_ = 'media_link__title')) and \
       (href := self._away_a_to_url(mediaLinkA)) and \
       href not in outlinks:
        outlinks.append(href)
    photos = None
    video = None
    if (thumbsDiv := (post.find('div', class_ = 'wall_text') if not isCopy else post).find('div', class_ = 'page_post_sized_thumbs')) and \
       not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
        photos = []
        for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
            if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
                _logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
                continue
            if 'data-video' in a.attrs:
                # Video
                video = Video(
                    id = a['data-video'],
                    list = a['data-list'],
                    duration = int(a['data-duration']),
                    url = f'https://vk.com{a["href"]}',
                    # 22 == len('background-image: url('); take everything up to the closing parenthesis
                    thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
                )
                continue
            # From here on: photo
            if 'onclick' not in a.attrs or not a['onclick'].startswith("return showPhoto('") or '{"temp":' not in a['onclick'] or not a['onclick'].endswith('}, event)'):
                _logger.warning(f'Photo thumb wrap on {url} has no or unexpected onclick, skipping')
                continue
            # The onclick handler embeds a JSON object describing the size variants
            photoData = a['onclick'][a['onclick'].find('{"temp":') : -8] # -8 = len(', event)')
            photoObj = json.loads(photoData)
            # Size variants are keyed by a single lowercase letter x, with [url, width, height] under 'x_'
            singleLetterKeys = [k for k in photoObj['temp'].keys() if len(k) == 1 and 97 <= ord(k) <= 122] # 97 = ord('a'), 122 = ord('z')
            for x in singleLetterKeys:
                # Merge base into URLs
                if not photoObj['temp'][x].startswith('https://'):
                    photoObj['temp'][x] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x]}'
                x_ = f'{x}_'
                if not photoObj['temp'][x_][0].startswith('https://'):
                    photoObj['temp'][x_][0] = f'{photoObj["temp"]["base"]}{photoObj["temp"][x_][0]}'
            # Validate the structure before trusting it: known keys only, consistent URLs on the expected host, and [url, int, int] triples
            if any(k not in {'base', 'w', 'w_', 'x', 'x_', 'y', 'y_', 'z', 'z_'} for k in photoObj['temp'].keys()) or \
               not all(photoObj['temp'][x] in (photoObj['temp'][f'{x}_'][0], photoObj['temp'][f'{x}_'][0] + '.jpg') for x in singleLetterKeys) or \
               not all(photoObj['temp'][x].startswith('https://sun') and '.userapi.com/' in photoObj['temp'][x] for x in singleLetterKeys) or \
               not all(len(photoObj['temp'][(x_ := f'{x}_')]) == 3 and isinstance(photoObj['temp'][x_][1], int) and isinstance(photoObj['temp'][x_][2], int) for x in singleLetterKeys):
                _logger.warning(f'Photo thumb wrap on {url} has unexpected data structure, skipping')
                continue
            photoVariants = []
            for x in singleLetterKeys:
                x_ = f'{x}_'
                photoVariants.append(PhotoVariant(url = f'{photoObj["temp"][x_][0]}.jpg' if '.jpg' not in photoObj['temp'][x_][0] else photoObj['temp'][x_][0], width = photoObj['temp'][x_][1], height = photoObj['temp'][x_][2]))
            # Only hrefs of the form /photo<digits, '-' and '_'> are reported as the photo page URL
            photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
            photos.append(Photo(variants = photoVariants, url = photoUrl))
    quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
    authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
    authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
    username = authorLink['href'].split('/')[-1]
    name = authorLink.text
    if authorHeading.find('div', class_ = 'page_verified') is not None:
        verified = True
    else:
        verified = False
    user = User(username = username, name = name, verified = verified)
    return VKontaktePost(
        url = url,
        date = self._date_span_to_date(dateSpan),
        content = textDiv.text if textDiv else None,
        user = user,
        # Empty collections are reported as None
        outlinks = outlinks or None,
        photos = photos or None,
        video = video or None,
        quotedPost = quotedPost,
    )
def _soup_to_items(self, soup):
for post in soup.find_all('div', class_ = 'post'):
yield self._post_div_to_item(post)
def _initial_page(self):
if self._initialPage is None:
_logger.info('Retrieving initial data')
r = self._get(self._baseUrl, headers = self._headers)
if r.status_code not in (200, 404):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
self._initialPage, self._initialPageSoup = r, bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
return self._initialPage, self._initialPageSoup
def get_items(self):
    """Yield the posts of the wall, starting with the initial page and then paginating.

    Nothing is yielded for a 404 page, a closed wall, a profile showing a
    'profile_deleted_text' heading, or a wall without posts. Further pages are
    requested in steps of 10 via ``_get_wall_offset`` until the "no posts"
    block is returned. Posts whose ID is among the last 1000 IDs seen are not
    yielded again.

    Raises ``snscrape.base.ScraperException`` on an unrecognised pagination response.
    """
    r, soup = self._initial_page()
    if r.status_code == 404:
        _logger.warning('Wall does not exist')
        return
    if soup.find('div', class_ = 'profile_closed_wall_dummy'):
        _logger.warning('Private profile')
        return
    if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
        # Unclear what this state represents, so just log website text.
        _logger.warning(profileDeleted.text)
        return
    newestPost = soup.find('div', class_ = 'post')
    if not newestPost:
        _logger.info('Wall has no posts')
        return
    # 'data-post-id' has the form '<owner>_<post>'
    ownerID = newestPost.attrs['data-post-id'].split('_')[0]
    # If there is a pinned post, we need its ID for the pagination requests
    if 'post_fixed' in newestPost.attrs['class']:
        fixedPostID = int(newestPost.attrs['id'].split('_')[1])
    else:
        fixedPostID = ''
    # Bounded memory of recently seen post IDs, used to suppress duplicates across overlapping pages
    last1000PostIDs = collections.deque(maxlen = 1000)
    def _process_soup(soup):
        # Yield the not-yet-seen posts of one page and remember their IDs.
        # NOTE(review): item.url is read unconditionally, so a None item from _soup_to_items would raise here — confirm it cannot occur
        nonlocal last1000PostIDs
        for item in self._soup_to_items(soup):
            postID = int(item.url.rsplit('_', 1)[1])
            if postID not in last1000PostIDs:
                yield item
                last1000PostIDs.append(postID)
    yield from _process_soup(soup)
    lastWorkingOffset = 0
    for offset in itertools.count(start = 10, step = 10):
        posts = self._get_wall_offset(fixedPostID, ownerID, offset)
        if posts.startswith('<div class="page_block no_posts">'):
            # Reached the end
            break
        if not posts.startswith('<div id="post'):
            if posts == '"\\/blank.php?block=119910902"':
                _logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
                # Walk the blocked window one offset at a time and keep whatever is retrievable
                for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
                    geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
                    if geoPosts.startswith('<div class="page_block no_posts">'):
                        # No breaking the outer loop, it'll just make one extra request and exit as well
                        break
                    if not geoPosts.startswith('<div id="post'):
                        if geoPosts == '"\\/blank.php?block=119910902"':
                            continue
                        raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
                    yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
                continue
            raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
        lastWorkingOffset = offset
        soup = bs4.BeautifulSoup(posts, 'lxml')
        yield from _process_soup(soup)
def _get_wall_offset(self, fixedPostID, ownerID, offset):
    """POST a ``get_wall`` request for *offset* and return the HTML payload as a string.

    Raises ``snscrape.base.ScraperException`` on a non-200 response.
    """
    requestHeaders = {**self._headers, 'X-Requested-With': 'XMLHttpRequest'}
    _logger.info(f'Retrieving page offset {offset}')
    formData = [
        ('act', 'get_wall'),
        ('al', 1),
        ('fixed', fixedPostID),
        ('offset', offset),
        ('onlyCache', 'false'),
        ('owner_id', ownerID),
        ('type', 'own'),
        ('wall_start_from', offset),
    ]
    response = self._post('https://vk.com/al_wall.php', data = formData, headers = requestHeaders)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
    # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
    return response.json()['payload'][1][0]
def _get_entity(self):
    """Collect profile metadata from the initial page and return it as a ``User``.

    Returns ``None`` when the profile page did not answer with status 200.
    Optional fields (description, websites, counters, following/followers)
    are set only when the corresponding element is present.
    """
    r, soup = self._initial_page()
    if r.status_code != 200:
        return
    kwargs = {}
    kwargs['username'] = r.url.rsplit('/', 1)[1]
    nameH1 = soup.find('h1', class_ = 'page_name')
    kwargs['name'] = nameH1.text
    kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
    if (descriptionDiv := soup.find('div', id = 'page_current_info')):
        kwargs['description'] = descriptionDiv.text
    if (infoDiv := soup.find('div', id = 'page_info_wrap')):
        websites = []
        for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
            if 'profile_info_row' in rowDiv['class']:
                labelDiv = rowDiv.find('div', class_ = 'fl_l')
                if not labelDiv or labelDiv.text != 'Website:':
                    continue
            else: # group_info_row
                # .get(): a row without a title attribute is skipped instead of raising KeyError
                title = rowDiv.get('title')
                if title == 'Description':
                    kwargs['description'] = rowDiv.text
                if title != 'Website':
                    continue
            for a in rowDiv.find_all('a'):
                # Same /away.php?to=... decoding as for post outlinks
                website = self._away_a_to_url(a)
                if website is None:
                    _logger.warning(f'Skipping odd website link: {a.get("href")!r}')
                    continue
                websites.append(website)
        if websites:
            kwargs['websites'] = websites
    def parse_num(s: str) -> typing.Tuple[int, int]:
        # Parse a displayed count such as '12K', '1.2K', '3.4M' or '1,234' into (value, granularity)
        for suffix, multiplier in (('K', 1000), ('M', 1000000)):
            if s.endswith(suffix):
                baseNum = s[:-1]
                precision = multiplier
                if '.' in baseNum:
                    precision //= (10 ** len(baseNum.split('.')[1]))
                # float() accepts decimals for both suffixes; round() guards against the float
                # product landing just below the intended integer
                return round(float(baseNum) * multiplier), precision
        return int(s.replace(',', '')), 1
    if (countsDiv := soup.find('div', class_ = 'counts_module')):
        for a in countsDiv.find_all('a', class_ = 'page_counter'):
            count, granularity = parse_num(a.find('div', class_ = 'count').text)
            label = a.find('div', class_ = 'label').text
            # Normalise singular labels to the plural field names
            if label in ('follower', 'post', 'photo', 'tag'):
                label = f'{label}s'
            if label in ('followers', 'posts', 'photos', 'tags'):
                kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)
    if (idolsDiv := soup.find('div', id = 'profile_idols')):
        if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
            kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
    # On public pages, this is where followers are listed
    if (followersDiv := soup.find('div', id = 'public_followers')):
        if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
            kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
    # On community groups, this is where followers are listed
    elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
        kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
    # On public groups, this is where followers are listed
    elif (followersDiv := soup.find('div', id = 'group_followers')):
        if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
            kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
    return User(**kwargs)
@classmethod
def _cli_setup_parser(cls, subparser):
    """Register the positional ``username`` argument on *subparser*."""
    usernameType = snscrape.base.nonempty_string('username')
    subparser.add_argument('username', type = usernameType, help = 'A VK username')
@classmethod
def _cli_from_args(cls, args):
    """Delegate to ``cls._cli_construct`` with the parsed ``username``."""
    username = args.username
    return cls._cli_construct(args, username)

151
snscrape/modules/weibo.py Normal file
View File

@@ -0,0 +1,151 @@
__all__ = ['Post', 'User', 'WeiboUserScraper']
import dataclasses
import logging
import re
import snscrape.base
import typing
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
# Unique sentinel stored in the scraper's _user attribute when the username could not be resolved.
_userDoesNotExist = object()
# Matches any '<...>' tag; used to strip markup from post text when no raw text is available.
_HTML_STRIP_PATTERN = re.compile(r'<[^>]*>')
@dataclasses.dataclass
class Post(snscrape.base.Item):
    """A Weibo post as returned by the m.weibo.cn timeline API."""

    url: str
    id: str
    user: typing.Optional['User']  # None when the API reports no user for the post
    createdAt: str # Can have a variety of inconsistent formats
    text: str  # The API's raw text if present, otherwise its HTML text with tags stripped
    repostsCount: typing.Optional[int]
    commentsCount: typing.Optional[typing.Union[int, str]]
    likesCount: typing.Optional[int]
    picturesCount: typing.Optional[int]
    pictures: typing.Optional[typing.List[str]] # May be shorter than pictureCount if the API didn't return all of them (e.g. post Ipay2evb0)
    video: typing.Optional[str]  # Set only for posts whose page info has type 'video'
    link: typing.Optional[str]  # Set only for posts whose page info has type 'webpage'
    repostedPost: typing.Optional['Post']  # The original post when this one is a repost
    def __str__(self):
        return self.url
@dataclasses.dataclass
class User(snscrape.base.Entity):
    """A Weibo user profile built from the API's user info object."""

    screenname: str
    uid: int
    verified: bool
    verifiedReason: typing.Optional[str]  # None when the API omits 'verified_reason'
    description: str
    statusesCount: int
    followersCount: int
    followCount: int
    avatar: str  # Value of the API's 'avatar_hd' field
    def __str__(self):
        # The mobile profile URL for this uid
        return f'https://m.weibo.cn/u/{self.uid}'
class WeiboUserScraper(snscrape.base.Scraper):
name = 'weibo-user'
def __init__(self, user, **kwargs):
    """Prepare a scraper for *user*: an ``int`` is taken as a user ID, anything else as a username."""
    super().__init__(**kwargs)
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    self._user = user
    # Usernames are resolved to a numeric ID later, by _ensure_user_id
    self._isUserId = isinstance(user, int)
    self._headers = {'User-Agent': userAgent}
def _ensure_user_id(self):
if self._isUserId:
return
r = self._get(f'https://m.weibo.cn/n/{self._user}', headers = self._headers, allowRedirects = False)
if r.status_code == 302 and r.headers['Location'].startswith('/u/') and len(r.headers['Location']) == 13 and r.headers['Location'][3:].strip('0123456789') == '':
# Redirect to uid URL
self._user = int(r.headers['Location'][3:])
self._isUserId = True
elif r.status_code == 200 and '<p class="h5-4con">用户不存在</p>' in r.text:
_logger.warning('User does not exist')
self._user = _userDoesNotExist
else:
raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')
def _check_timeline_response(self, r):
if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
# 'No content here yet'. Appears to happen sometimes on pagination, possibly due to too fast requests; retry this
return False, 'no-content message'
if r.status_code != 200:
return False, 'non-200 status code'
return True, None
def _mblog_to_item(self, mblog):
    """Convert one ``mblog`` object from the API into a ``Post``; reposted posts are converted recursively."""
    url = f'https://m.weibo.cn/status/{mblog["bid"]}'
    postId = mblog['id']
    user = None
    if mblog['user'] is not None:
        user = self._user_info_to_entity(mblog['user'])
    createdAt = mblog['created_at']
    # Prefer the raw text; otherwise strip the markup from the HTML text
    if 'raw_text' in mblog:
        text = mblog['raw_text']
    else:
        text = _HTML_STRIP_PATTERN.sub('', mblog['text'])
    pictures = None
    if 'pics' in mblog:
        pictures = [pic['large']['url'] for pic in mblog['pics']]
    # The page info describes either an attached video or a linked web page
    video = None
    link = None
    if 'page_info' in mblog:
        pageInfo = mblog['page_info']
        if pageInfo['type'] == 'video':
            video = pageInfo['media_info']['mp4_720p_mp4']
        elif pageInfo['type'] == 'webpage':
            link = pageInfo['page_url']
    repostedPost = None
    if 'retweeted_status' in mblog:
        repostedPost = self._mblog_to_item(mblog['retweeted_status'])
    return Post(
        url = url,
        id = postId,
        user = user,
        createdAt = createdAt,
        text = text,
        repostsCount = mblog.get('reposts_count'),
        commentsCount = mblog.get('comments_count'),
        likesCount = mblog.get('attitudes_count'),
        picturesCount = mblog.get('pic_num'),
        pictures = pictures,
        video = video,
        link = link,
        repostedPost = repostedPost,
    )
def get_items(self):
    '''Yield a Post for every card of type 9 on the user's timeline.

    Resolves the user ID first and yields nothing if the user does not
    exist. Pages are requested 25 entries at a time; the 'since_id' found
    in a page's cardlistInfo is sent with the next request, and iteration
    stops once a page carries no 'since_id'.

    Raises snscrape.base.ScraperException on a non-200 response.
    '''
    self._ensure_user_id()
    if self._user is _userDoesNotExist:
        return
    cursor = None
    while True:
        url = f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}&containerid=107603{self._user}&count=25'
        if cursor is not None:
            url += f'&since_id={cursor}'
        response = self._get(url, headers = self._headers, responseOkCallback = self._check_timeline_response)
        if response.status_code != 200:
            raise snscrape.base.ScraperException(f'Got status code {response.status_code}')
        data = response.json()['data']
        for card in data['cards']:
            cardType = card['card_type']
            if cardType == 9:
                yield self._mblog_to_item(card['mblog'])
            else:
                _logger.warning(f'Skipping card of type {cardType}')
        listInfo = data['cardlistInfo']
        if 'since_id' not in listInfo:
            # End of pagination
            break
        cursor = listInfo['since_id']
def _user_info_to_entity(self, userInfo):
    '''Build a User from a user info dict as delivered by the API.

    Every key except 'verified_reason' is required (KeyError when missing);
    verifiedReason is None if the dict has no 'verified_reason'.
    '''
    fields = {
        'screenname': userInfo['screen_name'],
        'uid': userInfo['id'],
        'verified': userInfo['verified'],
        'verifiedReason': userInfo.get('verified_reason'),
        'description': userInfo['description'],
        'statusesCount': userInfo['statuses_count'],
        'followersCount': userInfo['followers_count'],
        'followCount': userInfo['follow_count'],
        'avatar': userInfo['avatar_hd'],
    }
    return User(**fields)
def _get_entity(self):
    '''Fetch the scraped user's profile and return it as a User.

    Returns None if resolving the username showed that the user does not
    exist. Raises snscrape.base.ScraperException on a non-200 response.
    '''
    self._ensure_user_id()
    if self._user is _userDoesNotExist:
        return None
    response = self._get(f'https://m.weibo.cn/api/container/getIndex?type=uid&value={self._user}', headers = self._headers)
    if response.status_code == 200:
        return self._user_info_to_entity(response.json()['data']['userInfo'])
    raise snscrape.base.ScraperException('Could not fetch user info')
@classmethod
def _cli_setup_parser(cls, subparser):
    '''Register this scraper's command-line arguments on subparser.

    Adds the optional --name flag (stored as args.isName) and the positional
    'user' argument, validated by snscrape.base.nonempty_string('user').
    '''
    subparser.add_argument(
        '--name',
        dest = 'isName',
        action = 'store_true',
        help = 'Use username instead of user ID',
    )
    userType = snscrape.base.nonempty_string('user')
    subparser.add_argument('user', type = userType, help = 'A user ID')
@classmethod
def _cli_from_args(cls, args):
    '''Construct the scraper from parsed command-line arguments.

    With --name (args.isName) the user argument is passed on unchanged as a
    string; otherwise it is converted with int(), which raises ValueError
    for input that is not an integer.
    '''
    if args.isName:
        user = args.user
    else:
        user = int(args.user)
    return cls._cli_construct(args, user = user)

7
snscrape/version.py Normal file
View File

@@ -0,0 +1,7 @@
import importlib.metadata

# Version of the installed 'snscrape' distribution; stays None when
# importlib.metadata finds no such distribution.
__version__ = None
try:
    __version__ = importlib.metadata.version('snscrape')
except importlib.metadata.PackageNotFoundError:
    pass