diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b53ec4f --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +VK_USERNAME="your username" +VK_PASSWORD="your password" +CAPTCHA_HANDLE_URL="url to a place you control and can updated with the captcha value during development for where to get the captcha" \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 937466f..b155032 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,6 +20,7 @@ env: PYTHONPATH: ./ VK_USERNAME: ${{ secrets.VK_USERNAME }} VK_PASSWORD: ${{ secrets.VK_PASSWORD }} + CAPTCHA_HANDLE_URL: ${{ secrets.CAPTCHA_HANDLE_URL }} jobs: checks: diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py index 8a185a1..b6e3651 100644 --- a/vk_url_scraper/scraper.py +++ b/vk_url_scraper/scraper.py @@ -9,7 +9,7 @@ import requests import vk_api # used to get api_token after authentication import yt_dlp # to download videos from url -from .utils import mkdir_if_not_exists +from .utils import captcha_handler, mkdir_if_not_exists class VkScraper: @@ -38,7 +38,7 @@ class VkScraper: PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") - def __init__(self, username: str, password: str) -> None: + def __init__(self, username: str, password: str, captcha_handler=captcha_handler) -> None: """Initializes the scraper. 
def mkdir_if_not_exists(folder):
    """Create ``folder`` (and any missing parents) unless the path already exists."""
    if not os.path.exists(folder):
        # exist_ok closes the window in which something else could create the
        # folder between the existence check above and this call.
        os.makedirs(folder, exist_ok=True)


def captcha_handler(captcha, *, poll_interval=5, max_attempts=24, request_timeout=10):
    """Obtain a captcha solution and resubmit the request via ``captcha.try_again``.

    When the ``CAPTCHA_HANDLE_URL`` environment variable is set to a non-empty
    value, that URL is polled until its response body contains a line of the
    form ``<captcha.sid>=SOLUTION``. Otherwise the solution is read
    interactively from standard input.

    Parameters
    ----------
    captcha
        Object exposing ``sid``, ``get_url()``, ``get_image()`` and
        ``try_again(key)``.
    poll_interval : int or float, keyword-only
        Seconds to sleep between two polls of ``CAPTCHA_HANDLE_URL``.
    max_attempts : int, keyword-only
        Maximum number of polls before giving up.
    request_timeout : int or float, keyword-only
        Timeout in seconds applied to each poll request.

    Returns
    -------
    The value returned by ``captcha.try_again(solution)``, or ``False`` when
    no solution showed up at the URL within ``max_attempts`` polls.
    """
    total_wait = poll_interval * max_attempts
    print(
        f"""CAPTCHA DETECTED, please solve it and put the solution into the webpage specified in the 'CAPTCHA_HANDLE_URL' env variable in the next {total_wait}s. Put the answer in the format "{captcha.sid}=SOLUTION".

    {captcha.sid=}
    {captcha.get_url()=}
    {captcha.get_image()=}
    """
    )

    url = os.environ.get("CAPTCHA_HANDLE_URL")
    if not url:
        # No polling endpoint configured: ask the operator directly and submit
        # the whole answer (not just its first character).
        answer = input("Enter captcha code {0}: ".format(captcha.get_url())).strip()
        return captcha.try_again(answer)

    # Escape the sid so it is always matched literally inside the pattern.
    pattern = re.compile(rf"{re.escape(str(captcha.sid))}=(.*)")
    for attempt in range(max_attempts):
        print(f"sending request to {url=}")
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as err:
            # A transient network problem should not abort the polling window.
            print(f"request failed: {err!r}")
        else:
            print(f"got response {response.text=}")
            match = pattern.search(response.text)
            # group(1) is the text after "sid="; strip a trailing "\r" or
            # spaces that "." happily captures.
            solution = match.group(1).strip() if match else ""
            if solution:
                print(f"got captcha result {solution=}")
                return captcha.try_again(solution)
        if attempt < max_attempts - 1:
            print(f"sleeping {poll_interval} seconds")
            time.sleep(poll_interval)

    return False