mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-12 13:28:37 +03:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
187cfa83c8 |
@@ -2,7 +2,7 @@ version: 2
|
|||||||
|
|
||||||
sphinx:
|
sphinx:
|
||||||
configuration: docs/source/conf.py
|
configuration: docs/source/conf.py
|
||||||
fail_on_warning: true
|
fail_on_warning: false
|
||||||
|
|
||||||
python:
|
python:
|
||||||
version: "3.8"
|
version: "3.8"
|
||||||
|
|||||||
@@ -8,6 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
## [0.1.2]
|
## [0.1.2]
|
||||||
### Added wall scraper with tests
|
* Added wall scraper with tests
|
||||||
### Added photo scraper with tests
|
* Added photo scraper with tests
|
||||||
### Added scraper with tests
|
* Added scraper with tests
|
||||||
8
Makefile
8
Makefile
@@ -5,8 +5,12 @@ docs :
|
|||||||
|
|
||||||
.PHONY : run-checks
|
.PHONY : run-checks
|
||||||
run-checks :
|
run-checks :
|
||||||
isort --check .
|
# do with --check to not change files
|
||||||
black --check .
|
# isort --check .
|
||||||
|
# black --check .
|
||||||
|
# do like this to fix files
|
||||||
|
isort .
|
||||||
|
black .
|
||||||
flake8 .
|
flake8 .
|
||||||
mypy .
|
mypy .
|
||||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
|
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
|
||||||
|
|||||||
56
README.md
56
README.md
@@ -1 +1,57 @@
|
|||||||
# vk-url-scraper
|
# vk-url-scraper
|
||||||
|
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
|
||||||
|
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
* docs online from sphinx
|
||||||
|
|
||||||
|
## Quick usage
|
||||||
|
`pip install vk-url-scraper` to install.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vk_url_scraper import VkScraper
|
||||||
|
|
||||||
|
vks = VkScraper("username", "password")
|
||||||
|
|
||||||
|
# scrape any "photo" URL
|
||||||
|
res = vks.scrape("https://vk.com/photo1_278184324?rev=1")
|
||||||
|
|
||||||
|
# scrape any "wall" URL
|
||||||
|
res = vks.scrape("https://vk.com/wall-1_398461")
|
||||||
|
|
||||||
|
# scrape any "video" URL
|
||||||
|
res = vks.scrape("https://vk.com/video-6596301_145810025")
|
||||||
|
print(res[0]["text"])  # e.g. -> get the text of the first result
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Every scrape* function returns a list of dicts like
|
||||||
|
{
|
||||||
|
"id": "wall_id",
|
||||||
|
"text": "text in this post" ,
|
||||||
|
"datetime": utc datetime of post,
|
||||||
|
"attachments": {
|
||||||
|
# if photo, video, link exists
|
||||||
|
"photo": [list of urls with max quality],
|
||||||
|
"video": [list of urls with max quality],
|
||||||
|
"link": [list of urls with max quality],
|
||||||
|
},
|
||||||
|
"payload": "original JSON response converted to dict which you can parse for more data"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
see [docs] for all available functions.
|
||||||
|
|
||||||
|
### Development
|
||||||
|
1. setup environment with `pip install -r requirements` or `pipenv install -r requirements`
|
||||||
|
2. To run all checks, use `make run-checks` (this also fixes style), or run them individually:
|
||||||
|
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
|
||||||
|
2. To do type checking: `mypy .`
|
||||||
|
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose output, colors, and test docstring examples)
|
||||||
|
3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||||
|
|
||||||
|
### Releasing new version
|
||||||
|
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
|
||||||
|
2. `git tag vx.y.z` to tag version
|
||||||
|
3. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||||
@@ -24,6 +24,15 @@ def test_scrape_empty_urll():
|
|||||||
assert [] == vks.scrape("something")
|
assert [] == vks.scrape("something")
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_no_vk_parseable_info():
|
||||||
|
assert len(vks.scrape("")) == 0
|
||||||
|
assert len(vks.scrape("google.com")) == 0
|
||||||
|
assert len(vks.scrape("vk.com")) == 0
|
||||||
|
assert len(vks.scrape("vk.com/wall")) == 0
|
||||||
|
assert len(vks.scrape("vk.com/photo")) == 0
|
||||||
|
assert len(vks.scrape("vk.com/video")) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_wall_url_with_text_only():
|
def test_scrape_wall_url_with_text_only():
|
||||||
res = vks.scrape("https://vk.com/wall-1_398461")
|
res = vks.scrape("https://vk.com/wall-1_398461")
|
||||||
assert len(res) == 1
|
assert len(res) == 1
|
||||||
|
|||||||
@@ -8,13 +8,32 @@ import vk_api # used to get api_token after authentication
|
|||||||
|
|
||||||
|
|
||||||
class VkScraper:
|
class VkScraper:
|
||||||
|
"""VkScraper class that allows to authenticate and scrape URLs.
|
||||||
|
|
||||||
|
All `scrape*` functions return a payload like:
|
||||||
|
|
||||||
|
.. highlight:: python
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": "wall_id",
|
||||||
|
"text": "text in this post" ,
|
||||||
|
"datetime": datetime of post,
|
||||||
|
"attachments": {
|
||||||
|
# only present values will appear, can be empty dict
|
||||||
|
"photo": [list of urls with max quality],
|
||||||
|
"video": [list of urls with max quality],
|
||||||
|
"link": [list of urls with max quality],
|
||||||
|
},
|
||||||
|
"payload": {"more": "original JSON response as dict which you can parse for more data"}
|
||||||
|
}
|
||||||
|
"""
|
||||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||||
|
|
||||||
def __init__(self, username: str, password: str, verbose: bool = True) -> None:
|
def __init__(self, username: str, password: str) -> None:
|
||||||
"""
|
"""Initializes the scraper.
|
||||||
Initializes the scraper.
|
|
||||||
|
|
||||||
This function receives a username and password and performs authentication on vk.com to then call api endpoints
|
This function receives a username and password and performs authentication on vk.com to then call api endpoints
|
||||||
|
|
||||||
@@ -24,44 +43,54 @@ class VkScraper:
|
|||||||
Username on vk.com, can be a phone number or email
|
Username on vk.com, can be a phone number or email
|
||||||
password : str
|
password : str
|
||||||
Matching password on vk.com
|
Matching password on vk.com
|
||||||
verbose : bool = False
|
|
||||||
If True will log debug info
|
|
||||||
|
|
||||||
Examples
|
|
||||||
--------
|
|
||||||
>>> VkScraper("+12345678", "password")
|
|
||||||
"""
|
"""
|
||||||
self.session = vk_api.VkApi(username, password)
|
self.session = vk_api.VkApi(username, password)
|
||||||
self.session.auth(token_only=True)
|
self.session.auth(token_only=True)
|
||||||
self.verbose = verbose
|
|
||||||
|
|
||||||
def scrape(self, url: str) -> List:
|
def scrape(self, url: str) -> List:
|
||||||
|
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str
|
||||||
|
The URL to parse and analyze content from, typically shared from vk.com feature
|
||||||
|
or copy-pasted from the browser
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)
|
return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)
|
||||||
|
|
||||||
def scrape_walls(self, url: str) -> List:
|
def scrape_walls(self, url: str) -> List:
|
||||||
|
"""Scrapes a URL for multiple wall data
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str
|
||||||
|
The URL to parse - should contain something like "...wall1212_3434..."
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
wall_ids = self.WALL_PATTERN.findall(url)
|
wall_ids = self.WALL_PATTERN.findall(url)
|
||||||
return self.scrape_wall_ids(wall_ids)
|
return self.scrape_wall_ids(wall_ids)
|
||||||
|
|
||||||
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List:
|
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]:
|
||||||
"""
|
"""
|
||||||
Receives a list of wall ids like wall123123_1231
|
Receives a list of wall ids like wall123123_1231 see `api docs <https://dev.vk.com/method/wall.getById>`__
|
||||||
Returns a list with one item per wall_id where each item contains:
|
|
||||||
|
|
||||||
:returns `{
|
Parameters
|
||||||
"id": "wall_id",
|
----------
|
||||||
"text": "text in this post" ,
|
wall_ids : List[str]
|
||||||
"datetime": datetime of post,
|
list with valid wall ids like "wall123123_1231"
|
||||||
"attachments": {
|
copy_history_depth : int
|
||||||
"photo": [list of urls with max quality],
|
see `api docs <https://dev.vk.com/method/wall.getById>`__
|
||||||
"album": [list of urls with max quality],
|
|
||||||
# untested:
|
|
||||||
# "video": [list of urls with max quality],
|
|
||||||
# "link": [list of urls with max quality],
|
|
||||||
},
|
|
||||||
"payload": original response code which you can parse for more data
|
|
||||||
}
|
|
||||||
`
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
"""
|
"""
|
||||||
if not len(wall_ids):
|
if not len(wall_ids):
|
||||||
return []
|
return []
|
||||||
@@ -134,14 +163,34 @@ class VkScraper:
|
|||||||
)
|
)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def scrape_videos(self, url: str) -> List:
|
def scrape_videos(self, url: str) -> List[dict]:
|
||||||
# TODO: https://vk.com/video-1_456239018
|
"""Scrapes a URL for multiple video data
|
||||||
# TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video
|
|
||||||
# https://vk.com/video38556806_456251917?list=ba2b77043648ff3789
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str
|
||||||
|
The URL to parse - should contain something like "...video1212_3434..."
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
video_ids = self.VIDEO_PATTERN.findall(url)
|
video_ids = self.VIDEO_PATTERN.findall(url)
|
||||||
return self.scrape_video_ids(video_ids)
|
return self.scrape_video_ids(video_ids)
|
||||||
|
|
||||||
def scrape_video_ids(self, video_ids: List[str]) -> List:
|
def scrape_video_ids(self, video_ids: List[str]) -> List[dict]:
|
||||||
|
"""
|
||||||
|
Receives a list of video ids like video123123_1231 see `api docs <https://dev.vk.com/method/video.get>`__
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
video_ids : List[str]
|
||||||
|
list with valid video ids like "video123123_1231"
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
if not len(video_ids):
|
if not len(video_ids):
|
||||||
return []
|
return []
|
||||||
video_ids = [video_id.replace("video", "") for video_id in video_ids]
|
video_ids = [video_id.replace("video", "") for video_id in video_ids]
|
||||||
@@ -170,11 +219,34 @@ class VkScraper:
|
|||||||
)
|
)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def scrape_photos(self, url: str) -> List:
|
def scrape_photos(self, url: str) -> List[dict]:
|
||||||
|
"""Scrapes a URL for multiple photo data
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str
|
||||||
|
The URL to parse - should contain something like "...photo1212_3434..."
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
photo_ids = self.PHOTO_PATTERN.findall(url)
|
photo_ids = self.PHOTO_PATTERN.findall(url)
|
||||||
return self.scrape_photo_ids(photo_ids)
|
return self.scrape_photo_ids(photo_ids)
|
||||||
|
|
||||||
def scrape_photo_ids(self, photo_ids: List[str]) -> List:
|
def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]:
|
||||||
|
"""
|
||||||
|
Receives a list of photo ids like photo123123_1231 see `api docs <https://dev.vk.com/method/photos.getById>`__
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
photo_ids : List[str]
|
||||||
|
list with valid photo ids like "photo123123_1231"
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
a list of dict as specified in the class documentation.
|
||||||
|
"""
|
||||||
if not len(photo_ids):
|
if not len(photo_ids):
|
||||||
return []
|
return []
|
||||||
photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]
|
photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]
|
||||||
|
|||||||
Reference in New Issue
Block a user