docs

2026-06-12 13:28:37 +03:00 · 2022-06-18 00:11:24 +02:00
6 changed files with 181 additions and 40 deletions
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -2,7 +2,7 @@ version: 2
 sphinx:
  configuration: docs/source/conf.py
-  fail_on_warning: true
+  fail_on_warning: false
 python:
  version: "3.8"
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased
 ## [0.1.2]
-### Added wall scraper with tests
+* Added wall scraper with tests
-### Added photo scraper with tests
+* Added photo scraper with tests
-### Added scraper with tests
+* Added scraper with tests
--- a/8
+++ b/8
@@ -5,8 +5,12 @@ docs :
 .PHONY : run-checks
 run-checks :
-	isort --check .
+	# do with --check to not change files
-	black --check .
+	# isort --check .
 	# black --check .
 	# do like this to fix files
 	isort .
 	black .
 	flake8 .
 	mypy .
 	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
--- a/README.md
+++ b/README.md
@@ -1 +1,57 @@
 # vk-url-scraper
 Library to scrape data and especially media links (videos and photos) from vk.com URLs.
 # TODO
 * docs online from sphinx
 ## Quick usage
 `pip install vk-url-scraper` to install.
 ```python
 from vk_url_scraper import VkScraper
 vks = VkScraper("username", "password")
 # scrape any "photo" URL
 res = vks.scrape("https://vk.com/photo1_278184324?rev=1")
 # scrape any "wall" URL
 res = vks.scrape("https://vk.com/wall-1_398461")
 # scrape any "video" URL
 res = vks.scrape("https://vk.com/video-6596301_145810025")
 print(res[0]["text"])  # e.g. -> get the text of the first result
 ```
 ```python
 # Every scrape* function returns a list of dict like
 {
 	"id": "wall_id",
 	"text": "text in this post" ,
 	"datetime": utc datetime of post,
 	"attachments": {
 		# if photo, video, link exists
 		"photo": [list of urls with max quality],
 		"video": [list of urls with max quality],
 		"link": [list of urls with max quality],
 	},
 	"payload": "original JSON response converted to dict which you can parse for more data"
 }
 ```
 see [docs] for all available functions. 
 ### Development
 1. setup environment with `pip install -r requirements` or `pipenv install -r requirements`
 2. To run all checks do `make run-checks` (fixes style), or run them individually:
   1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
   2. To do type checking: `mypy .`
   3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose output, colors, and test docstring examples)
 3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
 ### Releasing new version
 1. edit [version.py](vk_url_scraper/version.py) with proper versioning
 2. `git tag vx.y.z` to tag version
 3. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -24,6 +24,15 @@ def test_scrape_empty_urll():
    assert [] == vks.scrape("something")
 def test_scrape_no_vk_parseable_info():
    # Inputs that carry no "<kind><digits>_<digits>" id must yield no results.
    unparseable_urls = (
        "",
        "google.com",
        "vk.com",
        "vk.com/wall",
        "vk.com/photo",
        "vk.com/video",
    )
    for candidate in unparseable_urls:
        assert len(vks.scrape(candidate)) == 0
 def test_scrape_wall_url_with_text_only():
    res = vks.scrape("https://vk.com/wall-1_398461")
    assert len(res) == 1
--- a/vk_url_scraper/scraper.py
+++ b/vk_url_scraper/scraper.py
@@ -8,13 +8,32 @@ import vk_api  # used to get api_token after authentication
 class VkScraper:
    """VkScraper class that allows to authenticate and scrape URLs.
    All `scrape*` functions return a payload like:
    .. highlight:: python
    .. code-block:: python
        {
            "id": "wall_id",
            "text": "text in this post" ,
            "datetime": datetime of post,
            "attachments": {
                # only present values will appear, can be empty dict
                "photo": [list of urls with max quality],
                "video": [list of urls with max quality],
                "link": [list of urls with max quality],
            },
            "payload": {"more": "original JSON response as dict which you can parse for more data"}
        }
    """
    WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
    VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
-    def __init__(self, username: str, password: str, verbose: bool = True) -> None:
+    def __init__(self, username: str, password: str) -> None:
-        """
+        """Initializes the scraper.
        Initializes the scraper.
        This function receives a username and password and performs authentication on vk.com to then call api endpoints
@@ -24,44 +43,54 @@ class VkScraper:
            Username on vk.com, can be a phone number or email
        password : str
            Matching password on vk.com
        verbose : bool = False
            If True will log debug info
        Examples
        --------
        >>> VkScraper("+12345678", "password")
        """
        self.session = vk_api.VkApi(username, password)
        self.session.auth(token_only=True)
        self.verbose = verbose
    def scrape(self, url: str) -> List[dict]:
        """Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...

        Parameters
        ----------
        url : str
            The URL to parse and analyze content from, typically shared from vk.com feature
            or copy-pasted from the browser

        Returns
        -------
        List[dict]
            Wall results first, then photo results, then video results; each dict
            is shaped as specified in the class documentation.
        """
        # plain list concatenation: the order of the three scrapers is the order of the output
        return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)
    def scrape_walls(self, url: str) -> List[dict]:
        """Scrapes a URL for multiple wall data

        Parameters
        ----------
        url : str
            The URL to parse - should contain something like "...wall1212_3434..."

        Returns
        -------
        List[dict]
            The result of `scrape_wall_ids` for every wall id that `WALL_PATTERN`
            finds in `url`; each dict is shaped as specified in the class documentation.
        """
        # findall returns every "wall<owner>_<post>" occurrence, so one URL may yield several ids
        wall_ids = self.WALL_PATTERN.findall(url)
        return self.scrape_wall_ids(wall_ids)
-    def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List:
+    def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]:
        """
-        Receives a list of wall ids like wall123123_1231
+        Receives a list of wall ids like wall123123_1231 see `api docs <https://dev.vk.com/method/wall.getById>`__
        Returns a list with one item per wall_id where each item contains:
-        :returns `{
+        Parameters
-            "id": "wall_id",
+        ----------
-            "text": "text in this post" ,
+        wall_ids : List[str]
-            "datetime": datetime of post,
+            list with valid wall ids like "wall123123_1231"
-            "attachments": {
+        copy_history_depth : int
-                "photo": [list of urls with max quality],
+            see `api docs <https://dev.vk.com/method/wall.getById>`__
                "album": [list of urls with max quality],
                # untested:
                # "video": [list of urls with max quality],
                # "link": [list of urls with max quality],
            },
            "payload": original response code which you can parse for more data
        }
        `
        Returns
        -------
        a list of dict as specified in the class documentation.
        """
        if not len(wall_ids):
            return []
@@ -134,14 +163,34 @@ class VkScraper:
            )
        return res
-    def scrape_videos(self, url: str) -> List:
+    def scrape_videos(self, url: str) -> List[dict]:
-        # TODO: https://vk.com/video-1_456239018
+        """Scrapes a URL for multiple video data
-        # TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video
+
-        # https://vk.com/video38556806_456251917?list=ba2b77043648ff3789
+        Parameters
        ----------
        url : str
            The URL to parse - should contain something like "...video1212_3434..."
        Returns
        -------
        a list of dict as specified in the class documentation.
        """
        video_ids = self.VIDEO_PATTERN.findall(url)
        return self.scrape_video_ids(video_ids)
-    def scrape_video_ids(self, video_ids: List[str]) -> List:
+    def scrape_video_ids(self, video_ids: List[str]) -> List[dict]:
        """
        Receives a list of video ids like video123123_1231 see `api docs <https://dev.vk.com/method/video.get>`__
        Parameters
        ----------
        video_ids : List[str]
            list with valid video ids like "video123123_1231"
        Returns
        -------
        a list of dict as specified in the class documentation.
        """
        if not len(video_ids):
            return []
        video_ids = [video_id.replace("video", "") for video_id in video_ids]
@@ -170,11 +219,34 @@ class VkScraper:
            )
        return res
-    def scrape_photos(self, url: str) -> List:
+    def scrape_photos(self, url: str) -> List[dict]:
        """Scrapes a URL for multiple photo data
        Parameters
        ----------
        url : str
            The URL to parse - should contain something like "...photo1212_3434..."
        Returns
        -------
        a list of dict as specified in the class documentation.
        """
        photo_ids = self.PHOTO_PATTERN.findall(url)
        return self.scrape_photo_ids(photo_ids)
-    def scrape_photo_ids(self, photo_ids: List[str]) -> List:
+    def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]:
        """
        Receives a list of photo ids like photo123123_1231 see `api docs <https://dev.vk.com/method/photos.getById>`__
        Parameters
        ----------
        photo_ids : List[str]
            list with valid photo ids like "photo123123_1231"
        Returns
        -------
        a list of dict as specified in the class documentation.
        """
        if not len(photo_ids):
            return []
        photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]