From 187cfa83c88d8b9c6db074c8e9086f913b0bc9bf Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 18 Jun 2022 00:11:24 +0200 Subject: [PATCH] docs --- .readthedocs.yaml | 2 +- CHANGELOG.md | 6 +- Makefile | 8 ++- README.md | 56 +++++++++++++++ tests/scraper_test.py | 9 +++ vk_url_scraper/scraper.py | 140 +++++++++++++++++++++++++++++--------- 6 files changed, 181 insertions(+), 40 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6331da1..c9596f7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -2,7 +2,7 @@ version: 2 sphinx: configuration: docs/source/conf.py - fail_on_warning: true + fail_on_warning: false python: version: "3.8" diff --git a/CHANGELOG.md b/CHANGELOG.md index 0791060..817830f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ## [0.1.2] -### Added wall scraper with tests -### Added photo scraper with tests -### Added scraper with tests \ No newline at end of file +* Added wall scraper with tests +* Added photo scraper with tests +* Added scraper with tests \ No newline at end of file diff --git a/Makefile b/Makefile index 364c1d0..7af2781 100644 --- a/Makefile +++ b/Makefile @@ -5,8 +5,12 @@ docs : .PHONY : run-checks run-checks : - isort --check . - black --check . + # do with --check to not change files + # isort --check . + # black --check . + # do like this to fix files + isort . + black . flake8 . mypy . CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/ diff --git a/README.md b/README.md index 8a9eb49..5cafeee 100644 --- a/README.md +++ b/README.md @@ -1 +1,57 @@ # vk-url-scraper +Library to scrape data and especially media links (videos and photos) from vk.com URLs. + + +# TODO +* docs online from sphinx + +## Quick usage +`pip install vk-url-scraper` to install. 
+ + +```python +from vk_url_scraper import VkScraper + +vks = VkScraper("username", "password") + +# scrape any "photo" URL +res = vks.scrape("https://vk.com/photo1_278184324?rev=1") + +# scrape any "wall" URL +res = vks.scrape("https://vk.com/wall-1_398461") + +# scrape any "video" URL +res = vks.scrape("https://vk.com/video-6596301_145810025") +print(res[0]["text"]) # eg: -> to get the text from code +``` + +```python +# Every scrape* function returns a list of dict like +{ + "id": "wall_id", + "text": "text in this post" , + "datetime": utc datetime of post, + "attachments": { + # if photo, video, link exists + "photo": [list of urls with max quality], + "video": [list of urls with max quality], + "link": [list of urls with max quality], + }, + "payload": "original JSON response converted to dict which you can parse for more data" +} +``` + +see [docs] for all available functions. + +### Development +1. setup environment with `pip install -r requirements` or `pipenv install -r requirements` +2. To run all checks do `make run-checks` (fixes style) or individually + 1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint + 2. To do type checking: `mypy .` + 3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples) +3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed + +### Releasing new version +1. edit [version.py](vk_url_scraper/version.py) with proper versioning +2. `git tag vx.y.z` to tag version +3. 
`git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/) \ No newline at end of file diff --git a/tests/scraper_test.py b/tests/scraper_test.py index 7a937a7..76ecf96 100644 --- a/tests/scraper_test.py +++ b/tests/scraper_test.py @@ -24,6 +24,15 @@ def test_scrape_empty_urll(): assert [] == vks.scrape("something") +def test_scrape_no_vk_parseable_info(): + assert len(vks.scrape("")) == 0 + assert len(vks.scrape("google.com")) == 0 + assert len(vks.scrape("vk.com")) == 0 + assert len(vks.scrape("vk.com/wall")) == 0 + assert len(vks.scrape("vk.com/photo")) == 0 + assert len(vks.scrape("vk.com/video")) == 0 + + def test_scrape_wall_url_with_text_only(): res = vks.scrape("https://vk.com/wall-1_398461") assert len(res) == 1 diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py index da0088e..30e3b2a 100644 --- a/vk_url_scraper/scraper.py +++ b/vk_url_scraper/scraper.py @@ -8,13 +8,32 @@ import vk_api # used to get api_token after authentication class VkScraper: + """VkScraper class that allows to authenticate and scrape URLs. + + All `scrape*` functions return a payload like: + + .. highlight:: python + .. code-block:: python + + { + "id": "wall_id", + "text": "text in this post" , + "datetime": datetime of post, + "attachments": { + # only present values will appear, can be empty dict + "photo": [list of urls with max quality], + "video": [list of urls with max quality], + "link": [list of urls with max quality], + }, + "payload": {"more": "original JSON response as dict which you can parse for more data"} + } + """ WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") - def __init__(self, username: str, password: str, verbose: bool = True) -> None: - """ - Initializes the scraper. + def __init__(self, username: str, password: str) -> None: + """Initializes the scraper. 
This function receives a username and password and performs authentication on vk.com to then call api endpoints @@ -24,44 +43,54 @@ class VkScraper: Username on vk.com, can be a phone number or email password : str Matching password on vk.com - verbose : bool = False - If True will log debug info - - Examples - -------- - >>> VkScraper("+12345678", "password") """ self.session = vk_api.VkApi(username, password) self.session.auth(token_only=True) - self.verbose = verbose def scrape(self, url: str) -> List: + """Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ... + + Parameters + ---------- + url : str + The URL to parse and analyze content from, typically shared from vk.com feature + or copy-pasted from the browser + + Returns + ------- + a list of dict as specified in the class documentation. + """ return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url) def scrape_walls(self, url: str) -> List: + """Scrapes a URL for multiple wall data + + Parameters + ---------- + url : str + The URL to parse - should contain something like "...wall1212_3434..." + + Returns + ------- + a list of dict as specified in the class documentation. 
+ """ wall_ids = self.WALL_PATTERN.findall(url) return self.scrape_wall_ids(wall_ids) - def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List: + def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]: """ - Receives a list of wall ids like wall123123_1231 - Returns a list with one item per wall_id where each item contains: + Receives a list of wall ids like wall123123_1231 see `api docs `__ - :returns `{ - "id": "wall_id", - "text": "text in this post" , - "datetime": datetime of post, - "attachments": { - "photo": [list of urls with max quality], - "album": [list of urls with max quality], - # untested: - # "video": [list of urls with max quality], - # "link": [list of urls with max quality], - }, - "payload": original response code which you can parse for more data - } - ` + Parameters + ---------- + wall_ids : List[str] + list with valid wall ids like "wall123123_1231" + copy_history_depth : int + see `api docs `__ + Returns + ------- + a list of dict as specified in the class documentation. """ if not len(wall_ids): return [] @@ -134,14 +163,34 @@ class VkScraper: ) return res - def scrape_videos(self, url: str) -> List: - # TODO: https://vk.com/video-1_456239018 - # TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video - # https://vk.com/video38556806_456251917?list=ba2b77043648ff3789 + def scrape_videos(self, url: str) -> List[dict]: + """Scrapes a URL for multiple video data + + Parameters + ---------- + url : str + The URL to parse - should contain something like "...video1212_3434..." + + Returns + ------- + a list of dict as specified in the class documentation. 
+ """ video_ids = self.VIDEO_PATTERN.findall(url) return self.scrape_video_ids(video_ids) - def scrape_video_ids(self, video_ids: List[str]) -> List: + def scrape_video_ids(self, video_ids: List[str]) -> List[dict]: + """ + Receives a list of video ids like video123123_1231 see `api docs `__ + + Parameters + ---------- + video_ids : List[str] + list with valid video ids like "video123123_1231" + + Returns + ------- + a list of dict as specified in the class documentation. + """ if not len(video_ids): return [] video_ids = [video_id.replace("video", "") for video_id in video_ids] @@ -170,11 +219,34 @@ class VkScraper: ) return res - def scrape_photos(self, url: str) -> List: + def scrape_photos(self, url: str) -> List[dict]: + """Scrapes a URL for multiple photo data + + Parameters + ---------- + url : str + The URL to parse - should contain something like "...photo1212_3434..." + + Returns + ------- + a list of dict as specified in the class documentation. + """ photo_ids = self.PHOTO_PATTERN.findall(url) return self.scrape_photo_ids(photo_ids) - def scrape_photo_ids(self, photo_ids: List[str]) -> List: + def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]: + """ + Receives a list of photo ids like photo123123_1231 see `api docs `__ + + Parameters + ---------- + photo_ids : List[str] + list with valid photo ids like "photo123123_1231" + + Returns + ------- + a list of dict as specified in the class documentation. + """ if not len(photo_ids): return [] photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]