mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-08 03:18:37 +03:00
token functionality
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
VK_USERNAME="your username"
|
VK_USERNAME="your username"
|
||||||
VK_PASSWORD="your password"
|
VK_PASSWORD="your password"
|
||||||
CAPTCHA_HANDLE_URL="url to a place you control and can updated with the captcha value during development for where to get the captcha"
|
VK_TOKEN="optional token"
|
||||||
5
.github/workflows/main.yml
vendored
5
.github/workflows/main.yml
vendored
@@ -21,7 +21,6 @@ env:
|
|||||||
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
||||||
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
||||||
VK_TOKEN: ${{ secrets.VK_TOKEN }}
|
VK_TOKEN: ${{ secrets.VK_TOKEN }}
|
||||||
CAPTCHA_HANDLE_URL: ${{ secrets.CAPTCHA_HANDLE_URL }}
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
checks:
|
checks:
|
||||||
@@ -33,10 +32,10 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
# python: ['3.7', '3.10']
|
# python: ['3.7', '3.10']
|
||||||
python: ['3.10']
|
python: ['3.10']
|
||||||
task: # --show-capture=no on purpose
|
task: # --show-capture=no on purpose
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
pytest -s --color=yes tests/
|
pytest --show-capture=no --color=yes tests/
|
||||||
|
|
||||||
include:
|
include:
|
||||||
- python: '3.10'
|
- python: '3.10'
|
||||||
|
|||||||
@@ -17,7 +17,9 @@ vks = None
|
|||||||
|
|
||||||
def test_login_success():
|
def test_login_success():
|
||||||
global vks
|
global vks
|
||||||
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN"))
|
vks = VkScraper(
|
||||||
|
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_empty_urll():
|
def test_scrape_empty_urll():
|
||||||
|
|||||||
@@ -29,6 +29,14 @@ def get_argument_parser():
|
|||||||
required=True,
|
required=True,
|
||||||
help="password for the valid vk.com account",
|
help="password for the valid vk.com account",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-t",
|
||||||
|
"--token",
|
||||||
|
action="store",
|
||||||
|
dest="token",
|
||||||
|
required=False,
|
||||||
|
help="optional token, when passed authentication will not be performed - good to avoid captcha issues",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
"--download",
|
"--download",
|
||||||
@@ -50,7 +58,7 @@ def get_argument_parser():
|
|||||||
def main():
|
def main():
|
||||||
parser = get_argument_parser()
|
parser = get_argument_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
vks = VkScraper(args.username, args.password)
|
vks = VkScraper(args.username, args.password, args.token)
|
||||||
text = " ".join(args.urls)
|
text = " ".join(args.urls)
|
||||||
res = vks.scrape(text)
|
res = vks.scrape(text)
|
||||||
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||||
|
|||||||
@@ -38,7 +38,9 @@ class VkScraper:
|
|||||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||||
|
|
||||||
def __init__(self, username: str, password: str, token: str = None, captcha_handler=captcha_handler) -> None:
|
def __init__(
|
||||||
|
self, username: str, password: str, token: str = None, captcha_handler=captcha_handler
|
||||||
|
) -> None:
|
||||||
"""Initializes the scraper.
|
"""Initializes the scraper.
|
||||||
|
|
||||||
This function receives a username and password (or access token) and performs
|
This function receives a username and password (or access token) and performs
|
||||||
@@ -53,7 +55,9 @@ class VkScraper:
|
|||||||
token : str
|
token : str
|
||||||
Access token received after authenticating, can be found in the vk_config.v2.json file
|
Access token received after authenticating, can be found in the vk_config.v2.json file
|
||||||
"""
|
"""
|
||||||
self.session = vk_api.VkApi(username, password, token=token, captcha_handler=captcha_handler)
|
self.session = vk_api.VkApi(
|
||||||
|
username, password, token=token, captcha_handler=captcha_handler
|
||||||
|
)
|
||||||
if token is None or len(token) == 0:
|
if token is None or len(token) == 0:
|
||||||
self.session.auth(token_only=True)
|
self.session.auth(token_only=True)
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
|
|
||||||
class DateTimeEncoder(json.JSONEncoder):
|
class DateTimeEncoder(json.JSONEncoder):
|
||||||
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
|
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
|
||||||
@@ -22,28 +18,8 @@ def mkdir_if_not_exists(folder):
|
|||||||
|
|
||||||
def captcha_handler(captcha):
|
def captcha_handler(captcha):
|
||||||
print(
|
print(
|
||||||
f"""CAPTCHA DETECTED, please solve it and put the solution into the webpage specified in the 'CAPTCHA_HANDLE_URL' env variable in the next 10min. Put the answer in the format "{captcha.sid}=SOLUTION".
|
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}",
|
||||||
|
|
||||||
{captcha.sid=}
|
|
||||||
{captcha.get_url()=}
|
|
||||||
""",
|
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
if "CAPTCHA_HANDLE_URL" in os.environ:
|
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
|
||||||
url = os.environ["CAPTCHA_HANDLE_URL"]
|
return captcha.try_again(key)
|
||||||
regex_string = re.compile(f"{captcha.sid}=(.*)")
|
|
||||||
for wait in (10 * 6) * [10]: # tries every 10s for 10min
|
|
||||||
print(f"sending request to {url=}", flush=True)
|
|
||||||
r = requests.get(url)
|
|
||||||
if r.status_code == 200:
|
|
||||||
print(f"got response {r.text=}", flush=True)
|
|
||||||
if key := regex_string.search(r.text):
|
|
||||||
print(f"got captcha result {key=} {key[1]=}", flush=True)
|
|
||||||
return captcha.try_again(key[1])
|
|
||||||
print(f"sleeping {wait} seconds", flush=True)
|
|
||||||
time.sleep(wait)
|
|
||||||
else:
|
|
||||||
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
|
|
||||||
return captcha.try_again(key)
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|||||||
Reference in New Issue
Block a user