attempts at captcha fix in workflow

This commit is contained in:
msramalho
2022-06-21 12:16:58 +02:00
parent 24a1313a65
commit 59d53be68b
4 changed files with 39 additions and 3 deletions

3
.env.example Normal file
View File

@@ -0,0 +1,3 @@
VK_USERNAME="your username"
VK_PASSWORD="your password"
CAPTCHA_HANDLE_URL="url of a page you control and can update with the captcha value during development; the captcha solution is fetched from it"

View File

@@ -20,6 +20,7 @@ env:
PYTHONPATH: ./
VK_USERNAME: ${{ secrets.VK_USERNAME }}
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
CAPTCHA_HANDLE_URL: ${{ secrets.CAPTCHA_HANDLE_URL }}
jobs:
checks:

View File

@@ -9,7 +9,7 @@ import requests
import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url
from .utils import mkdir_if_not_exists
from .utils import captcha_handler, mkdir_if_not_exists
class VkScraper:
@@ -38,7 +38,7 @@ class VkScraper:
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(self, username: str, password: str) -> None:
def __init__(self, username: str, password: str, captcha_handler=captcha_handler) -> None:
"""Initializes the scraper.
This function receives a username and password and performs authentication on vk.com to then call api endpoints
@@ -50,7 +50,7 @@ class VkScraper:
password : str
Matching password on vk.com
"""
self.session = vk_api.VkApi(username, password)
self.session = vk_api.VkApi(username, password, captcha_handler=captcha_handler)
self.session.auth(token_only=True)
def scrape(self, url: str) -> List:

View File

@@ -1,7 +1,11 @@
import json
import os
import re
import time
from datetime import datetime
import requests
class DateTimeEncoder(json.JSONEncoder):
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
@@ -14,3 +18,31 @@ class DateTimeEncoder(json.JSONEncoder):
def mkdir_if_not_exists(folder):
    """Create ``folder`` (including missing parent directories) if it is absent.

    The directory is created directly instead of checking for existence
    first, so a concurrent process creating the same path between a check
    and the creation cannot make this call fail.

    Parameters
    ----------
    folder : str
        Path of the directory to create.
    """
    try:
        os.makedirs(folder)
    except FileExistsError:
        # The path already exists (possibly created concurrently): nothing to do.
        pass
def captcha_handler(captcha, *, attempts=24, wait=5, timeout=10):
    """Resolve a captcha challenge and retry the request that triggered it.

    Prints the captcha details, then obtains the solution in one of two ways:

    * If the ``CAPTCHA_HANDLE_URL`` environment variable is set, that URL is
      polled up to ``attempts`` times, ``wait`` seconds apart, until the
      response body contains a line of the form ``<captcha.sid>=SOLUTION``.
    * Otherwise the solution is read interactively from standard input.

    Parameters
    ----------
    captcha
        Captcha object exposing ``sid``, ``get_url()``, ``get_image()`` and
        ``try_again(key)``.
    attempts : int
        Maximum number of polling requests (default 24).
    wait : int or float
        Seconds to sleep between polling requests (default 5).
    timeout : int or float
        Timeout in seconds for each polling request (default 10).

    Returns
    -------
    The result of ``captcha.try_again(solution)`` when a solution was
    obtained, or ``False`` when polling ended without finding one.
    """
    print(
        f"""CAPTCHA DETECTED, please solve it and put the solution into the webpage specified in the 'CAPTCHA_HANDLE_URL' env variable in the next {attempts * wait}s. Put the answer in the format "{captcha.sid}=SOLUTION".
{captcha.sid=}
{captcha.get_url()=}
{captcha.get_image()=}
"""
    )

    if "CAPTCHA_HANDLE_URL" not in os.environ:
        # Interactive fallback: submit the whole typed answer, not just its first character.
        key = input("Enter captcha code {0}: ".format(captcha.get_url())).strip()
        return captcha.try_again(key)

    url = os.environ["CAPTCHA_HANDLE_URL"]
    # Escape the sid so regex metacharacters in it cannot alter the pattern.
    solution_pattern = re.compile(rf"{re.escape(str(captcha.sid))}=(.*)")
    for attempt in range(attempts):
        print(f"sending request to {url=}")
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # A transient network failure must not abort the polling loop.
            print(f"request failed {error=}")
        else:
            print(f"got response {r.text=}")
            match = solution_pattern.search(r.text)
            # Group 1 is the solution itself; group 0 would include the "sid=" prefix.
            key = match[1].strip() if match else ""
            if key:
                print(f"got captcha result {key=}")
                return captcha.try_again(key)
        if attempt < attempts - 1:  # no point sleeping after the final attempt
            print(f"sleeping {wait} seconds")
            time.sleep(wait)
    return False