docs

2026-06-10 04:18:35 +03:00 · 2022-06-18 00:11:24 +02:00
6 changed files with 181 additions and 40 deletions
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -2,7 +2,7 @@ version: 2

 sphinx:
  configuration: docs/source/conf.py
-  fail_on_warning: true
+  fail_on_warning: false

 python:
  version: "3.8"
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased

 ## [0.1.2]
-### Added wall scraper with tests
-### Added photo scraper with tests
-### Added scraper with tests
+* Added wall scraper with tests
+* Added photo scraper with tests
+* Added scraper with tests
--- a/8
+++ b/8
@@ -5,8 +5,12 @@ docs :

 .PHONY : run-checks
 run-checks :
-	isort --check .
-	black --check .
+	# do with --check to not change files
+	# isort --check .
+	# black --check .
+	# do like this to fix files
+	isort .
+	black .
 	flake8 .
 	mypy .
 	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
--- a/README.md
+++ b/README.md
@@ -1 +1,57 @@
 # vk-url-scraper
+Library to scrape data and especially media links (videos and photos) from vk.com URLs.
+
+
+# TODO
+* docs online from sphinx
+
+## Quick usage
+`pip install vk-url-scraper` to install.
+
+
+```python
+from vk_url_scraper import VkScraper
+
+vks = VkScraper("username", "password")
+
+# scrape any "photo" URL
+res = vks.scrape("https://vk.com/photo1_278184324?rev=1")
+
+# scrape any "wall" URL
+res = vks.scrape("https://vk.com/wall-1_398461")
+
+# scrape any "video" URL
+res = vks.scrape("https://vk.com/video-6596301_145810025")
+print(res[0]["text"])  # e.g. get the text of the first result
+```
+
+```python
+# Every scrape* function returns a list of dict like
+{
+	"id": "wall_id",
+	"text": "text in this post" ,
+	"datetime": utc datetime of post,
+	"attachments": {
+		# if photo, video, link exists
+		"photo": [list of urls with max quality],
+		"video": [list of urls with max quality],
+		"link": [list of urls with max quality],
+	},
+	"payload": "original JSON response converted to dict which you can parse for more data"
+}
+```
+
+see [docs] for all available functions. 
+
+### Development
+1. setup environment with `pip install -r requirements` or `pipenv install -r requirements`
+2. To run all checks use `make run-checks` (fixes style), or run them individually:
+   1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
+   2. To do type checking: `mypy .`
+   3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose output, colors, and test docstring examples)
+3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
+
+### Releasing new version
+1. edit [version.py](vk_url_scraper/version.py) with proper versioning
+2. `git tag vx.y.z` to tag version
+3. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -24,6 +24,15 @@ def test_scrape_empty_urll():
    assert [] == vks.scrape("something")


+def test_scrape_no_vk_parseable_info():
+    assert len(vks.scrape("")) == 0
+    assert len(vks.scrape("google.com")) == 0
+    assert len(vks.scrape("vk.com")) == 0
+    assert len(vks.scrape("vk.com/wall")) == 0
+    assert len(vks.scrape("vk.com/photo")) == 0
+    assert len(vks.scrape("vk.com/video")) == 0
+
+
 def test_scrape_wall_url_with_text_only():
    res = vks.scrape("https://vk.com/wall-1_398461")
    assert len(res) == 1
--- a/vk_url_scraper/scraper.py
+++ b/vk_url_scraper/scraper.py
@@ -8,13 +8,32 @@ import vk_api  # used to get api_token after authentication


 class VkScraper:
+    """VkScraper class that allows authenticating and scraping URLs.
+
+    All `scrape*` functions return a payload like:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        {
+            "id": "wall_id",
+            "text": "text in this post" ,
+            "datetime": datetime of post,
+            "attachments": {
+                # only present values will appear, can be empty dict
+                "photo": [list of urls with max quality],
+                "video": [list of urls with max quality],
+                "link": [list of urls with max quality],
+            },
+            "payload": {"more": "original JSON response as dict which you can parse for more data"}
+        }
+    """
    WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
    VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")

-    def __init__(self, username: str, password: str, verbose: bool = True) -> None:
-        """
-        Initializes the scraper.
+    def __init__(self, username: str, password: str) -> None:
+        """Initializes the scraper.

        This function receives a username and password and performs authentication on vk.com to then call api endpoints

@@ -24,44 +43,54 @@ class VkScraper:
            Username on vk.com, can be a phone number or email
        password : str
            Matching password on vk.com
-        verbose : bool = False
-            If True will log debug info
-
-        Examples
-        --------
-        >>> VkScraper("+12345678", "password")
        """
        self.session = vk_api.VkApi(username, password)
        self.session.auth(token_only=True)
-        self.verbose = verbose

    def scrape(self, url: str) -> List:
+        """Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
+
+        Parameters
+        ----------
+        url : str
+            The URL to parse and analyze content from, typically shared from vk.com feature
+            or copy-pasted from the browser
+
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)

    def scrape_walls(self, url: str) -> List:
+        """Scrapes a URL for multiple wall data
+
+        Parameters
+        ----------
+        url : str
+            The URL to parse - should contain something like "...wall1212_3434..."
+
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        wall_ids = self.WALL_PATTERN.findall(url)
        return self.scrape_wall_ids(wall_ids)

-    def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List:
+    def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]:
        """
-        Receives a list of wall ids like wall123123_1231
-        Returns a list with one item per wall_id where each item contains:
+        Receives a list of wall ids like wall123123_1231 see `api docs <https://dev.vk.com/method/wall.getById>`__

-        :returns `{
-            "id": "wall_id",
-            "text": "text in this post" ,
-            "datetime": datetime of post,
-            "attachments": {
-                "photo": [list of urls with max quality],
-                "album": [list of urls with max quality],
-                # untested:
-                # "video": [list of urls with max quality],
-                # "link": [list of urls with max quality],
-            },
-            "payload": original response code which you can parse for more data
-        }
-        `
+        Parameters
+        ----------
+        wall_ids : List[str]
+            list with valid wall ids like "wall123123_1231"
+        copy_history_depth : int
+            see `api docs <https://dev.vk.com/method/wall.getById>`__

+        Returns
+        -------
+        a list of dict as specified in the class documentation.
        """
        if not len(wall_ids):
            return []
@@ -134,14 +163,34 @@ class VkScraper:
            )
        return res

-    def scrape_videos(self, url: str) -> List:
-        # TODO: https://vk.com/video-1_456239018
-        # TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video
-        # https://vk.com/video38556806_456251917?list=ba2b77043648ff3789
+    def scrape_videos(self, url: str) -> List[dict]:
+        """Scrapes a URL for multiple video data
+
+        Parameters
+        ----------
+        url : str
+            The URL to parse - should contain something like "...video1212_3434..."
+
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        video_ids = self.VIDEO_PATTERN.findall(url)
        return self.scrape_video_ids(video_ids)

-    def scrape_video_ids(self, video_ids: List[str]) -> List:
+    def scrape_video_ids(self, video_ids: List[str]) -> List[dict]:
+        """
+        Receives a list of video ids like video123123_1231 see `api docs <https://dev.vk.com/method/video.get>`__
+
+        Parameters
+        ----------
+        video_ids : List[str]
+            list with valid video ids like "video123123_1231"
+            
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        if not len(video_ids):
            return []
        video_ids = [video_id.replace("video", "") for video_id in video_ids]
@@ -170,11 +219,34 @@ class VkScraper:
            )
        return res

-    def scrape_photos(self, url: str) -> List:
+    def scrape_photos(self, url: str) -> List[dict]:
+        """Scrapes a URL for multiple photo data
+
+        Parameters
+        ----------
+        url : str
+            The URL to parse - should contain something like "...photo1212_3434..."
+
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        photo_ids = self.PHOTO_PATTERN.findall(url)
        return self.scrape_photo_ids(photo_ids)

-    def scrape_photo_ids(self, photo_ids: List[str]) -> List:
+    def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]:
+        """
+        Receives a list of photo ids like photo123123_1231 see `api docs <https://dev.vk.com/method/photos.getById>`__
+
+        Parameters
+        ----------
+        photo_ids : List[str]
+            list with valid photo ids like "photo123123_1231"
+            
+        Returns
+        -------
+        a list of dict as specified in the class documentation.
+        """
        if not len(photo_ids):
            return []
        photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]