This commit is contained in:
msramalho
2022-06-18 00:11:24 +02:00
parent b63289829c
commit 187cfa83c8
6 changed files with 181 additions and 40 deletions

View File

@@ -8,13 +8,32 @@ import vk_api # used to get api_token after authentication
class VkScraper:
"""VkScraper class that authenticates on vk.com and scrapes URLs.
All `scrape*` functions return a payload like:
.. highlight:: python
.. code-block:: python
{
"id": "wall_id",
"text": "text in this post" ,
"datetime": datetime of post,
"attachments": {
# only present values will appear, can be empty dict
"photo": [list of urls with max quality],
"video": [list of urls with max quality],
"link": [list of urls with max quality],
},
"payload": {"more": "original JSON response as dict which you can parse for more data"}
}
"""
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(self, username: str, password: str, verbose: bool = True) -> None:
"""
Initializes the scraper.
def __init__(self, username: str, password: str) -> None:
"""Initializes the scraper.
This function receives a username and password and performs authentication on vk.com to then call api endpoints
@@ -24,44 +43,54 @@ class VkScraper:
Username on vk.com, can be a phone number or email
password : str
Matching password on vk.com
verbose : bool = True
If True, debug info will be logged
Examples
--------
>>> VkScraper("+12345678", "password")
"""
self.session = vk_api.VkApi(username, password)
self.session.auth(token_only=True)
self.verbose = verbose
def scrape(self, url: str) -> List:
"""Scrapes a URL for every supported kind of inner link: wall, photo and video.
Delegates to `scrape_walls`, `scrape_photos` and `scrape_videos` with the same URL
and concatenates their results in that order.
Parameters
----------
url : str
The URL to parse and analyze content from, typically obtained through the vk.com
share feature or copy-pasted from the browser
Returns
-------
a list of dict as specified in the class documentation: wall results first,
then photo results, then video results.
"""
return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)
def scrape_walls(self, url: str) -> List:
"""Scrapes every wall post referenced in a URL.
All substrings of the URL that match `WALL_PATTERN` are collected and handed to
`scrape_wall_ids`.
Parameters
----------
url : str
The URL to parse - should contain something like "...wall1212_3434..."
Returns
-------
a list of dict as specified in the class documentation; an empty list when the
URL contains no wall id.
"""
# WALL_PATTERN has a single capture group around the whole id, so findall yields the id strings
wall_ids = self.WALL_PATTERN.findall(url)
return self.scrape_wall_ids(wall_ids)
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List:
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]:
"""
Receives a list of wall ids like wall123123_1231
Returns a list with one item per wall_id where each item contains:
Receives a list of wall ids like wall123123_1231, see `api docs <https://dev.vk.com/method/wall.getById>`__
:returns `{
"id": "wall_id",
"text": "text in this post" ,
"datetime": datetime of post,
"attachments": {
"photo": [list of urls with max quality],
"album": [list of urls with max quality],
# untested:
# "video": [list of urls with max quality],
# "link": [list of urls with max quality],
},
"payload": original response code which you can parse for more data
}
`
Parameters
----------
wall_ids : List[str]
list with valid wall ids like "wall123123_1231"
copy_history_depth : int
see `api docs <https://dev.vk.com/method/wall.getById>`__
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(wall_ids):
return []
@@ -134,14 +163,34 @@ class VkScraper:
)
return res
def scrape_videos(self, url: str) -> List:
# TODO: https://vk.com/video-1_456239018
# TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video
# https://vk.com/video38556806_456251917?list=ba2b77043648ff3789
def scrape_videos(self, url: str) -> List[dict]:
"""Scrapes every video referenced in a URL.
All substrings of the URL that match `VIDEO_PATTERN` are collected and handed to
`scrape_video_ids`.
Parameters
----------
url : str
The URL to parse - should contain something like "...video1212_3434..."
Returns
-------
a list of dict as specified in the class documentation; an empty list when the
URL contains no video id.
"""
# VIDEO_PATTERN has a single capture group around the whole id, so findall yields the id strings
video_ids = self.VIDEO_PATTERN.findall(url)
return self.scrape_video_ids(video_ids)
def scrape_video_ids(self, video_ids: List[str]) -> List:
def scrape_video_ids(self, video_ids: List[str]) -> List[dict]:
"""
Receives a list of video ids like video123123_1231, see `api docs <https://dev.vk.com/method/video.get>`__
Parameters
----------
video_ids : List[str]
list with valid video ids like "video123123_1231"
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(video_ids):
return []
video_ids = [video_id.replace("video", "") for video_id in video_ids]
@@ -170,11 +219,34 @@ class VkScraper:
)
return res
def scrape_photos(self, url: str) -> List:
def scrape_photos(self, url: str) -> List[dict]:
"""Scrapes every photo referenced in a URL.
All substrings of the URL that match `PHOTO_PATTERN` are collected and handed to
`scrape_photo_ids`.
Parameters
----------
url : str
The URL to parse - should contain something like "...photo1212_3434..."
Returns
-------
a list of dict as specified in the class documentation; an empty list when the
URL contains no photo id.
"""
# PHOTO_PATTERN has a single capture group around the whole id, so findall yields the id strings
photo_ids = self.PHOTO_PATTERN.findall(url)
return self.scrape_photo_ids(photo_ids)
def scrape_photo_ids(self, photo_ids: List[str]) -> List:
def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]:
"""
Receives a list of photo ids like photo123123_1231, see `api docs <https://dev.vk.com/method/photos.getById>`__
Parameters
----------
photo_ids : List[str]
list with valid photo ids like "photo123123_1231"
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(photo_ids):
return []
photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]