From 80b43f7c95f67aa5ef83a7093c2c7dde9beea2ce Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 21 Jun 2022 14:23:54 +0200 Subject: [PATCH] token functionality --- .env.example | 2 +- .github/workflows/main.yml | 5 ++--- tests/scraper_test.py | 4 +++- vk_url_scraper/__main__.py | 10 +++++++++- vk_url_scraper/scraper.py | 8 ++++++-- vk_url_scraper/utils.py | 30 +++--------------------------- 6 files changed, 24 insertions(+), 35 deletions(-) diff --git a/.env.example b/.env.example index b53ec4f..4d367ca 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ VK_USERNAME="your username" VK_PASSWORD="your password" -CAPTCHA_HANDLE_URL="url to a place you control and can updated with the captcha value during development for where to get the captcha" \ No newline at end of file +VK_TOKEN="optional token" \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9d1aacd..87521a4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,7 +21,6 @@ env: VK_USERNAME: ${{ secrets.VK_USERNAME }} VK_PASSWORD: ${{ secrets.VK_PASSWORD }} VK_TOKEN: ${{ secrets.VK_TOKEN }} - CAPTCHA_HANDLE_URL: ${{ secrets.CAPTCHA_HANDLE_URL }} jobs: checks: @@ -33,10 +32,10 @@ jobs: matrix: # python: ['3.7', '3.10'] python: ['3.10'] - task: # --show-capture=no on purpose + task: # --show-capture=no on purpose - name: Test run: | - pytest -s --color=yes tests/ + pytest --show-capture=no --color=yes tests/ include: - python: '3.10' diff --git a/tests/scraper_test.py b/tests/scraper_test.py index c12a261..5178f45 100644 --- a/tests/scraper_test.py +++ b/tests/scraper_test.py @@ -17,7 +17,9 @@ vks = None def test_login_success(): global vks - vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")) + vks = VkScraper( + os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN") + ) def test_scrape_empty_urll(): diff --git a/vk_url_scraper/__main__.py b/vk_url_scraper/__main__.py index ce47a24..3ebf819 100644 --- a/vk_url_scraper/__main__.py +++ b/vk_url_scraper/__main__.py @@ -29,6 +29,14 @@ def get_argument_parser(): required=True, help="password for the valid vk.com account", ) + parser.add_argument( + "-t", + "--token", + action="store", + dest="token", + required=False, + help="optional token, when passed authentication will not be performed - good to avoid captcha issues", + ) parser.add_argument( "-d", "--download", @@ -50,7 +58,7 @@ def get_argument_parser(): def main(): parser = get_argument_parser() args = parser.parse_args() - vks = VkScraper(args.username, args.password) + vks = VkScraper(args.username, args.password, args.token) text = " ".join(args.urls) res = vks.scrape(text) res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder) diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py index ba6353b..2d73b92 100644 --- a/vk_url_scraper/scraper.py +++ b/vk_url_scraper/scraper.py @@ -38,7 +38,9 @@ class VkScraper: PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") - def __init__(self, username: str, password: str, token: str = None, captcha_handler=captcha_handler) -> None: + def __init__( + self, username: str, password: str, token: str = None, captcha_handler=captcha_handler + ) -> None: """Initializes the scraper. This function receives a username and password (or access token) and performs @@ -53,7 +55,9 @@ class VkScraper: token : str Access token received after authenticating, can be found in the vl_config.v2.json file """ - self.session = vk_api.VkApi(username, password, token=token, captcha_handler=captcha_handler) + self.session = vk_api.VkApi( + username, password, token=token, captcha_handler=captcha_handler + ) if token is None or len(token) == 0: self.session.auth(token_only=True) diff --git a/vk_url_scraper/utils.py b/vk_url_scraper/utils.py index 36400a2..77aa5af 100644 --- a/vk_url_scraper/utils.py +++ b/vk_url_scraper/utils.py @@ -1,11 +1,7 @@ import json import os -import re -import time from datetime import datetime -import requests - class DateTimeEncoder(json.JSONEncoder): # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder) @@ -22,28 +18,8 @@ def mkdir_if_not_exists(folder): def captcha_handler(captcha): print( - f"""CAPTCHA DETECTED, please solve it and put the solution into the webpage specified in the 'CAPTCHA_HANDLE_URL' env variable in the next 10min. Put the answer in the format "{captcha.sid}=SOLUTION". - - {captcha.sid=} - {captcha.get_url()=} - """, + f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}", flush=True, ) - if "CAPTCHA_HANDLE_URL" in os.environ: - url = os.environ["CAPTCHA_HANDLE_URL"] - regex_string = re.compile(f"{captcha.sid}=(.*)") - for wait in (10 * 6) * [10]: # tries every 10s for 10min - print(f"sending request to {url=}", flush=True) - r = requests.get(url) - if r.status_code == 200: - print(f"got response {r.text=}", flush=True) - if key := regex_string.search(r.text): - print(f"got captcha result {key=} {key[1]=}", flush=True) - return captcha.try_again(key[1]) - print(f"sleeping {wait} seconds", flush=True) - time.sleep(wait) - else: - key = input(f"Enter captcha code for {captcha.get_url()}:").strip() - return captcha.try_again(key) - - return False + key = input(f"Enter captcha code for {captcha.get_url()}:").strip() + return captcha.try_again(key)