Compare commits

..

18 Commits

Author SHA1 Message Date
Miguel Ramalho
24a1313a65 Bump version to v0.2.4 for release 2022-06-21 01:33:38 +02:00
msramalho
64df4eec28 3.10 only due to test issues 2022-06-21 01:33:16 +02:00
Miguel Ramalho
42bdc1441c Bump version to v0.2.3 for release 2022-06-21 01:23:29 +02:00
msramalho
c25880ee6d fix tests 2022-06-21 01:21:53 +02:00
msramalho
e1e3648852 remove print 2022-06-21 01:17:47 +02:00
msramalho
c74dc280d8 fix ytdlp naming 2022-06-21 01:17:26 +02:00
Miguel Ramalho
ab15b35008 Bump version to v0.2.2 for release 2022-06-21 01:04:24 +02:00
msramalho
62c4536d0b fix ytdl filenames 2022-06-21 01:03:48 +02:00
Miguel Ramalho
eac0fc4904 Bump version to v0.2.1 for release 2022-06-20 23:57:04 +02:00
Miguel Ramalho
1341cd866f Bump version to v0.2.0 for release 2022-06-20 23:56:13 +02:00
Miguel Ramalho
7824c2922d Bump version to v0.2.0 for release 2022-06-20 23:54:52 +02:00
msramalho
c9a3ece9af adds command line interface 2022-06-20 23:52:14 +02:00
msramalho
50b78d618a .txt 2022-06-20 13:45:23 +02:00
msramalho
c4a1333428 cleanup 2022-06-20 13:44:05 +02:00
msramalho
edb02ae049 version 2022-06-18 00:16:41 +02:00
Miguel Ramalho
284fd3fdf7 fix tests 2022-06-18 00:16:12 +02:00
msramalho
3078495a2a version update 2022-06-18 00:11:51 +02:00
msramalho
187cfa83c8 docs 2022-06-18 00:11:24 +02:00
26 changed files with 594 additions and 263 deletions

View File

@@ -1,18 +1,18 @@
<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
Fixes #
Changes proposed in this pull request:
<!-- Please list all changes/additions here. -->
-
## Before submitting
<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#making-a-pull-request)
section of the `CONTRIBUTING` docs.
- [ ] I've updated or added any relevant docstrings following the syntax described in the
[Writing docstrings](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.
<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
Fixes #
Changes proposed in this pull request:
<!-- Please list all changes/additions here. -->
-
## Before submitting
<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#making-a-pull-request)
section of the `CONTRIBUTING` docs.
- [ ] I've updated or added any relevant docstrings following the syntax described in the
[Writing docstrings](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.

View File

@@ -29,11 +29,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python: ['3.7', '3.10']
task:
# python: ['3.7', '3.10']
python: ['3.10']
task: # --show-capture=no on purpose
- name: Test
run: |
pytest --color=yes tests/
pytest --show-capture=no --color=yes tests/
include:
- python: '3.10'
@@ -121,10 +122,6 @@ jobs:
name: package
path: dist
# - name: Generate release notes
# run: |
# python scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md
- name: Publish package to PyPI
run: |
twine upload -u '${{ secrets.PYPI_USERNAME }}' -p '${{ secrets.PYPI_PASSWORD }}' dist/*

View File

@@ -1,27 +0,0 @@
name: PR Checks
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on:
pull_request:
branches:
- main
paths:
- 'vk_url_scraper/**'
jobs:
changelog:
name: CHANGELOG
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
steps:
- uses: actions/checkout@v1
- name: Check that CHANGELOG has been updated
run: |
# If this step fails, this means you haven't updated the CHANGELOG.md
# file with notes on your contribution.
git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!"

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
.env
vk_config.v2.json
output/
# build artifacts
.eggs/

View File

@@ -2,7 +2,7 @@ version: 2
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
fail_on_warning: false
python:
version: "3.8"

View File

@@ -1,13 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
## [0.1.2]
### Added wall scraper with tests
### Added photo scraper with tests
### Added scraper with tests

View File

@@ -156,8 +156,6 @@ When you're ready to contribute code to address an open issue, please follow the
If the build fails, it's most likely due to small formatting issues. If the error message isn't clear, feel free to comment on this in your pull request.
And finally, please update the [CHANGELOG](https://github.com/bellingcat/vk-url-scraper/blob/main/CHANGELOG.md) with notes on your contribution in the "Unreleased" section at the top.
After all of the above checks have passed, you can now open [a new GitHub pull request](https://github.com/bellingcat/vk-url-scraper/pulls).
Make sure you have a clear description of the problem and the solution, and include a link to relevant issues.

View File

@@ -5,8 +5,12 @@ docs :
.PHONY : run-checks
run-checks :
isort --check .
black --check .
# do with --check to not change files
# isort --check .
# black --check .
# do like this to fix files
isort .
black .
flake8 .
mypy .
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/

View File

@@ -5,6 +5,7 @@ name = "pypi"
[packages]
vk-api = "*"
yt-dlp = "*"
[dev-packages]
sphinx-copybutton = "==0.5.0"

177
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "bab533e734f6da55647cc76a9f5a51d46c641723d485e38a16e2e31bca097130"
"sha256": "4224e1159b48a3e903601184bf0d3f7613a817b5fca7062a119c549563527798"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,74 @@
]
},
"default": {
"brotli": {
"hashes": [
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
"sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
"sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
"sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
"sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"markers": "platform_python_implementation == 'CPython'",
"version": "==1.0.9"
},
"certifi": {
"hashes": [
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
@@ -40,6 +108,47 @@
"markers": "python_full_version >= '3.5.0'",
"version": "==3.3"
},
"mutagen": {
"hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version < '4' and python_full_version >= '3.5.0'",
"version": "==1.45.1"
},
"pycryptodomex": {
"hashes": [
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
"sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659",
"sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45",
"sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2",
"sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489",
"sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2",
"sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750",
"sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151",
"sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2",
"sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045",
"sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859",
"sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89",
"sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e",
"sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461",
"sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6",
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f",
"sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb",
"sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662",
"sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b",
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe",
"sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8",
"sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125",
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c",
"sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48",
"sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff",
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"requests": {
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
@@ -63,6 +172,68 @@
],
"index": "pypi",
"version": "==11.9.8"
},
"websockets": {
"hashes": [
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",
"sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c",
"sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76",
"sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47",
"sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69",
"sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079",
"sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c",
"sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55",
"sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02",
"sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559",
"sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3",
"sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e",
"sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978",
"sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98",
"sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae",
"sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755",
"sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d",
"sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991",
"sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1",
"sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680",
"sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247",
"sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f",
"sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2",
"sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7",
"sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4",
"sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667",
"sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb",
"sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094",
"sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36",
"sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79",
"sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500",
"sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e",
"sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582",
"sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442",
"sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd",
"sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6",
"sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731",
"sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4",
"sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d",
"sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8",
"sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f",
"sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677",
"sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8",
"sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9",
"sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e",
"sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b",
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
],
"markers": "python_version >= '3.7'",
"version": "==10.3"
},
"yt-dlp": {
"hashes": [
"sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134",
"sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7"
],
"index": "pypi",
"version": "==2022.5.18"
}
},
"develop": {
@@ -351,7 +522,7 @@
"sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700",
"sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec"
],
"markers": "python_version >= '3.7'",
"markers": "python_version < '3.10'",
"version": "==4.11.4"
},
"iniconfig": {
@@ -720,7 +891,7 @@
"sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2",
"sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec"
],
"markers": "python_version < '4' and python_full_version >= '3.6.3'",
"markers": "python_full_version >= '3.6.3' and python_full_version < '4.0.0'",
"version": "==12.4.4"
},
"secretstorage": {

101
README.md
View File

@@ -1 +1,102 @@
# vk-url-scraper
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage).
## Installation
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
To use the library you will need a valid username/password combination for vk.com.
## Command line usage
```bash
# run this to learn more about the parameters
vk_url_scraper --help
# scrape a URL and get the JSON result in the console
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
# OR
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
# you can also have multiple urls
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
# save the JSON output into a file
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
# download any photos or videos found in these URLS
# this will use or create an output/ folder and dump the files there
vk_url_scraper -u "username here" -p "password here" --download --urls https://vk.com/wall12345_6789
# or
vk_url_scraper -u "username here" -p "password here" -d --urls https://vk.com/wall12345_6789
```
## Python library usage
```python
from vk_url_scraper import VkScraper
vks = VkScraper("username", "password")
# scrape any "photo" URL
res = vks.scrape("https://vk.com/photo1_278184324?rev=1")
# scrape any "wall" URL
res = vks.scrape("https://vk.com/wall-1_398461")
# scrape any "video" URL
res = vks.scrape("https://vk.com/video-6596301_145810025")
print(res[0]["text"]) # eg: -> to get the text from code
```
```python
# Every scrape* function returns a list of dict like
{
"id": "wall_id",
"text": "text in this post" ,
"datetime": utc datetime of post,
"attachments": {
# if photo, video, link exists
"photo": [list of urls with max quality],
"video": [list of urls with max quality],
"link": [list of urls with max quality],
},
"payload": "original JSON response converted to dict which you can parse for more data"
}
```
see [docs] for all available functions.
### TODO
* scrape album links
* scrape profile links
* docs online from sphinx
## Development
(more info in [CONTRIBUTING.md](CONTRIBUTING.md)).
1. setup dev environment with `pip install -r dev-requirements.txt` or `pipenv install -r dev-requirements.txt`
1. setup environment with `pip install -r requirements.txt` or `pipenv install -r requirements.txt`
2. To run all checks do `make run-checks` (fixes style) or individually
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
2. To do type checking: `mypy .`
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
## Releasing new version
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
2. run `./scripts/release.sh` to create a tag and push, alternatively
1. `git tag vx.y.z` to tag version
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
### Fixing a failed release
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
```bash
git tag -l | xargs git tag -d && git fetch -t
```
Then repeat the steps above.

View File

@@ -1,24 +0,0 @@
# GitHub Release Process
## Steps
1. Update the version in `vk_url_scraper/version.py`.
2. Run the release script:
```bash
./scripts/release.sh
```
This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
which will trigger a workflow on GitHub Actions that handles the rest.
## Fixing a failed release
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
```bash
git tag -l | xargs git tag -d && git fetch -t
```
Then repeat the steps above.

View File

@@ -1 +0,0 @@
../../CHANGELOG.md

View File

@@ -23,7 +23,6 @@ Contents
installation
overview
CHANGELOG
.. toctree::
:hidden:

View File

@@ -11,4 +11,5 @@ idna==3.3
requests==2.28.0
urllib3==1.26.9
vk-api==11.9.8
python-dotenv==0.20.0
python-dotenv==0.20.0
yt-dlp==2022.5.18

View File

@@ -1,39 +0,0 @@
from datetime import datetime
from pathlib import Path
from vk_url_scraper.version import VERSION
def main():
    """Add a heading for the current ``VERSION`` to ``CHANGELOG.md``.

    Reads ``CHANGELOG.md`` from the current working directory, and inserts a
    blank line plus a ``## [v<VERSION>](<release url>) - <YYYY-MM-DD>`` heading
    directly after the ``## Unreleased`` line. If a heading for the current
    version is already present, a message is printed and the file is left
    untouched.

    Raises
    ------
    RuntimeError
        If no line starting with ``## Unreleased`` was seen before scanning
        stopped (end of file or the first ``## [v`` release heading).
    """
    changelog = Path("CHANGELOG.md")

    # Explicit UTF-8 so reading/writing does not depend on the platform locale.
    with changelog.open(encoding="utf-8") as f:
        lines = f.readlines()

    insert_index: int = -1
    for i, line in enumerate(lines):
        if line.startswith("## Unreleased"):
            insert_index = i + 1
        elif line.startswith(f"## [v{VERSION}]"):
            print("CHANGELOG already up-to-date")
            return
        elif line.startswith("## [v"):
            # Reached an older release heading: nothing further to inspect.
            break

    if insert_index < 0:
        raise RuntimeError("Couldn't find 'Unreleased' section")

    lines.insert(insert_index, "\n")
    lines.insert(
        insert_index + 1,
        f"## [v{VERSION}](https://github.com/bellingcat/vk-url-scraper/releases/tag/v{VERSION}) - "
        f"{datetime.now().strftime('%Y-%m-%d')}\n",
    )

    with changelog.open("w", encoding="utf-8") as f:
        f.writelines(lines)
if __name__ == "__main__":
main()

View File

@@ -7,7 +7,6 @@ TAG=$(python -c 'from vk_url_scraper.version import VERSION; print("v" + VERSION
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
python scripts/prepare_changelog.py
git add -A
git commit -m "Bump version to $TAG for release" || true && git push
echo "Creating new git tag $TAG"

View File

@@ -1,78 +0,0 @@
# encoding: utf-8
"""
Prepares markdown release notes for GitHub releases.
"""
import os
from typing import List, Optional
import packaging.version
TAG = os.environ["TAG"]
ADDED_HEADER = "### Added 🎉"
CHANGED_HEADER = "### Changed ⚠️"
FIXED_HEADER = "### Fixed ✅"
REMOVED_HEADER = "### Removed 👋"
def get_change_log_notes() -> str:
    """Collect the CHANGELOG.md notes that belong to the release ``TAG``.

    Scans ``CHANGELOG.md`` line by line. The ``## Unreleased`` heading is
    skipped, the ``## [<TAG>]`` heading starts the collected section, and any
    other ``## `` heading stops the scan. Inside the section, the
    ``### Added/Changed/Fixed/Removed`` sub-headings are replaced by their
    decorated header constants.

    Returns
    -------
    the collected notes, stripped, prefixed with a "What's new" heading.
    """
    # (plain sub-heading prefix, decorated replacement), checked in this order
    decorated_headers = (
        ("### Added", ADDED_HEADER),
        ("### Changed", CHANGED_HEADER),
        ("### Fixed", FIXED_HEADER),
        ("### Removed", REMOVED_HEADER),
    )
    collecting = False
    notes: List[str] = []
    with open("CHANGELOG.md") as changelog:
        for raw_line in changelog:
            if raw_line.startswith("## "):
                if raw_line.startswith("## Unreleased"):
                    continue
                if raw_line.startswith(f"## [{TAG}]"):
                    collecting = True
                    continue
                break
            if not collecting:
                continue
            for prefix, replacement in decorated_headers:
                if raw_line.startswith(prefix):
                    raw_line = replacement + "\n"
                    break
            notes.append(raw_line)
    assert notes
    body = "".join(notes).strip()
    return "## What's new\n\n" + body + "\n"
def get_commit_history() -> str:
    """Build a "Commits" section listing commits since the previous release.

    Asks git for all ``v*`` tags sorted by version (latest first) and picks the
    first one whose version is lower than ``TAG``; pre-release tags are ignored
    unless ``TAG`` is itself a pre-release. The one-line, first-parent git log
    between that tag and the parent of ``TAG`` is returned; if no earlier tag
    exists, the full first-parent log is used instead.
    """
    target_version = packaging.version.parse(TAG)

    # Latest version first, one tag per line (may include blank lines).
    tag_listing = os.popen("git tag -l --sort=-version:refname 'v*'").read()

    previous_tag: Optional[str] = None
    for candidate in tag_listing.split("\n"):
        if not candidate.strip():
            continue
        candidate_version = packaging.version.parse(candidate)
        is_unwanted_prerelease = target_version.pre is None and candidate_version.pre is not None
        if is_unwanted_prerelease:
            continue
        if candidate_version < target_version:
            previous_tag = candidate
            break

    if previous_tag is None:
        log_command = "git log --oneline --first-parent"
    else:
        log_command = f"git log {previous_tag}..{TAG}^ --oneline --first-parent"
    return "## Commits\n\n" + os.popen(log_command).read()
def main():
    """Print the changelog notes followed by the commit history."""
    for build_section in (get_change_log_notes, get_commit_history):
        print(build_section())
if __name__ == "__main__":
main()

View File

@@ -47,7 +47,7 @@ setup(
url="https://github.com/bellingcat/vk-url-scraper",
author="Bellingcat",
author_email="tech@bellingcat.com",
license="Apache",
license="MIT",
packages=find_packages(
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
),

View File

@@ -1,12 +1,11 @@
import datetime
import os
import tempfile
import pytest
from vk_url_scraper import VkScraper
from .util import assert_equal_lists
vks = None
@@ -24,6 +23,15 @@ def test_scrape_empty_urll():
assert [] == vks.scrape("something")
def test_scrape_no_vk_parseable_info():
    """Each of these inputs is expected to produce an empty scrape result."""
    inputs_without_results = (
        "",
        "google.com",
        "vk.com",
        "vk.com/wall",
        "vk.com/photo",
        "vk.com/video",
    )
    for candidate in inputs_without_results:
        assert len(vks.scrape(candidate)) == 0
def test_scrape_wall_url_with_text_only():
res = vks.scrape("https://vk.com/wall-1_398461")
assert len(res) == 1
@@ -73,12 +81,30 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 15
assert len(res[0]["attachments"].keys()) == 3
assert_equal_lists(list(res[0]["attachments"].keys()), ["photo", "link", "video"])
for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys())
assert len(res[0]["attachments"]["photo"]) == 5
assert len(res[0]["attachments"]["link"]) == 1
assert len(res[0]["attachments"]["video"]) == 1
def test_scrape_download_multiple_media():
    """Download the media of one wall post and check the expected files exist."""
    scraped = vks.scrape("https://vk.com/w=wall-17315087_74182")
    wanted_names = {
        "wall-17315087_74182_0.jpg",
        "wall-17315087_74182_1.jpg",
        "wall-17315087_74182_2.jpg",
        "wall-17315087_74182_3.jpg",
        "wall-17315087_74182_4.jpg",
        "wall-17315087_74182_0.mkv",
    }
    with tempfile.TemporaryDirectory(dir="./") as workdir:
        vks.download_media(scraped, workdir)
        present_names = set(os.listdir(workdir))
        # every expected file must be present; extra files are tolerated
        assert wanted_names <= present_names
def test_scrape_photo_only():
res = vks.scrape("https://vk.com/apiclub?z=photo-1_457242435%2Falbum-1_00%2Frev")
assert len(res) == 1
@@ -105,5 +131,12 @@ def test_scrape_video_only():
def test_scrape_video_only2():
res = vks.scrape("https://vk.com/video-1_456239018")
print(res[0]["attachments"]["video"][0])
res = vks.scrape("https://vk.com/video-17546758_456239898")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir))
# different systems might attribute different extension
assert (
"video-17546758_456239898_0.webm" in found_files
or "video-17546758_456239898_0.mp4" in found_files
)

View File

@@ -1,3 +0,0 @@
def assert_equal_lists(l1, l2):
    """Assert that two lists have the same length and the same sorted string form."""
    size_first, size_second = len(l1), len(l2)
    assert size_first == size_second
    rendered_first = str(sorted(l1))
    rendered_second = str(sorted(l2))
    assert rendered_first == rendered_second

View File

@@ -1 +1,2 @@
from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists

View File

@@ -0,0 +1,63 @@
import argparse
import json
from .scraper import VkScraper
from .utils import DateTimeEncoder
def get_argument_parser():
    """
    Creates the CMD line arguments. 'python -m vk_url_scraper --help'

    Returns
    -------
    an ``argparse.ArgumentParser`` whose parsed namespace exposes ``username``,
    ``password``, ``download`` and ``urls`` (a list with everything after ``--urls``).
    """
    parser = argparse.ArgumentParser(
        description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
    )
    parser.add_argument(
        "-u",
        "--username",
        action="store",
        dest="username",
        required=True,
        help="username for a valid vk.com account",
    )
    parser.add_argument(
        "-p",
        "--password",
        action="store",
        dest="password",
        required=True,
        help="password for the valid vk.com account",
    )
    # argparse.BooleanOptionalAction only exists on Python >= 3.9; on older
    # interpreters fall back to a plain flag instead of failing with AttributeError.
    download_action = getattr(argparse, "BooleanOptionalAction", "store_true")
    parser.add_argument(
        "-d",
        "--download",
        action=download_action,
        dest="download",
        help="if set then all photos and videos will be downloaded to folder output/",
    )
    # REMAINDER swallows everything that follows, hence --urls must come last.
    parser.add_argument(
        "--urls",
        action="store",
        dest="urls",
        nargs=argparse.REMAINDER,
        required=True,
        help="must be the last argument: any text with one or more urls to scrape",
    )
    return parser
def main():
    """Command line entry point: scrape the given URLs and print the result as JSON.

    Parses the command line, authenticates a ``VkScraper`` with the given
    credentials, scrapes the space-joined ``--urls`` text and prints the result
    as indented JSON. When the download flag is set, ``download_media`` is then
    called with the scraped results.
    """
    cli_args = get_argument_parser().parse_args()
    scraper = VkScraper(cli_args.username, cli_args.password)
    results = scraper.scrape(" ".join(cli_args.urls))
    print(json.dumps(results, ensure_ascii=False, indent=4, cls=DateTimeEncoder))
    if cli_args.download:
        scraper.download_media(results)
if __name__ == "__main__":
main()

View File

@@ -1,20 +1,45 @@
import os
import re
from collections import defaultdict
from datetime import datetime
from typing import List
from urllib.parse import urlparse
import requests
import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url
from .utils import mkdir_if_not_exists
class VkScraper:
"""VkScraper class that allows to authenticate and scrape URLs.
All `scrape*` functions return a payload like:
.. highlight:: python
.. code-block:: python
{
"id": "wall_id",
"text": "text in this post" ,
"datetime": datetime of post,
"attachments": {
# only present values will appear, can be empty dict
"photo": [list of urls with max quality],
"video": [list of urls with max quality],
"link": [list of urls with max quality],
},
"payload": {"more": "original JSON response as dict which you can parse for more data"}
}
"""
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(self, username: str, password: str, verbose: bool = True) -> None:
"""
Initializes the scraper.
def __init__(self, username: str, password: str) -> None:
"""Initializes the scraper.
This function receives a username and password and performs authentication on vk.com to then call api endpoints
@@ -24,44 +49,54 @@ class VkScraper:
Username on vk.com, can be a phone number or email
password : str
Matching password on vk.com
verbose : bool = True
If True will log debug info
Examples
--------
>>> VkScraper("+12345678", "password")
"""
self.session = vk_api.VkApi(username, password)
self.session.auth(token_only=True)
self.verbose = verbose
def scrape(self, url: str) -> List:
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
Parameters
----------
url : str
The URL to parse and analyze content from, typically shared from vk.com feature
or copy-pasted from the browser
Returns
-------
a list of dict as specified in the class documentation.
"""
return self.scrape_walls(url) + self.scrape_photos(url) + self.scrape_videos(url)
def scrape_walls(self, url: str) -> List:
"""Scrapes a URL for multiple wall data
Parameters
----------
url : str
The URL to parse - should contain something like "...wall1212_3434..."
Returns
-------
a list of dict as specified in the class documentation.
"""
wall_ids = self.WALL_PATTERN.findall(url)
return self.scrape_wall_ids(wall_ids)
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List:
def scrape_wall_ids(self, wall_ids: List[str], copy_history_depth: int = 2) -> List[dict]:
"""
Receives a list of wall ids like wall123123_1231
Returns a list with one item per wall_id where each item contains:
Receives a list of wall ids like wall123123_1231 see `api docs <https://dev.vk.com/method/wall.getById>`__
:returns `{
"id": "wall_id",
"text": "text in this post" ,
"datetime": datetime of post,
"attachments": {
"photo": [list of urls with max quality],
"album": [list of urls with max quality],
# untested:
# "video": [list of urls with max quality],
# "link": [list of urls with max quality],
},
"payload": original response code which you can parse for more data
}
`
Parameters
----------
wall_ids : List[str]
list with valid wall ids like "wall123123_1231"
copy_history_depth : int
see `api docs <https://dev.vk.com/method/wall.getById>`__
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(wall_ids):
return []
@@ -134,14 +169,34 @@ class VkScraper:
)
return res
def scrape_videos(self, url: str) -> List:
# TODO: https://vk.com/video-1_456239018
# TODO https://vk.com/asdasdasd?w=wall-17315087_74182 has 1 video
# https://vk.com/video38556806_456251917?list=ba2b77043648ff3789
def scrape_videos(self, url: str) -> List[dict]:
"""Scrapes a URL for multiple video data
Parameters
----------
url : str
The URL to parse - should contain something like "...video1212_3434..."
Returns
-------
a list of dict as specified in the class documentation.
"""
video_ids = self.VIDEO_PATTERN.findall(url)
return self.scrape_video_ids(video_ids)
def scrape_video_ids(self, video_ids: List[str]) -> List:
def scrape_video_ids(self, video_ids: List[str]) -> List[dict]:
"""
Receives a list of video ids like video123123_1231 see `api docs <https://dev.vk.com/method/video.get>`__
Parameters
----------
video_ids : List[str]
list with valid video ids like "video123123_1231"
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(video_ids):
return []
video_ids = [video_id.replace("video", "") for video_id in video_ids]
@@ -170,11 +225,34 @@ class VkScraper:
)
return res
def scrape_photos(self, url: str) -> List:
def scrape_photos(self, url: str) -> List[dict]:
"""Scrapes a URL for multiple photo data
Parameters
----------
url : str
The URL to parse - should contain something like "...photo1212_3434..."
Returns
-------
a list of dict as specified in the class documentation.
"""
photo_ids = self.PHOTO_PATTERN.findall(url)
return self.scrape_photo_ids(photo_ids)
def scrape_photo_ids(self, photo_ids: List[str]) -> List:
def scrape_photo_ids(self, photo_ids: List[str]) -> List[dict]:
"""
Receives a list of photo ids like photo123123_1231 see `api docs <https://dev.vk.com/method/photos.getById>`__
Parameters
----------
photo_ids : List[str]
list with valid photo ids like "photo123123_1231"
Returns
-------
a list of dict as specified in the class documentation.
"""
if not len(photo_ids):
return []
photo_ids = [photo_id.replace("photo", "") for photo_id in photo_ids]
@@ -200,3 +278,56 @@ class VkScraper:
}
)
return res
def download_media(self, results: List[dict], destination: str = "./output/") -> List[str]:
    """
    Receives a list of dicts as returned by any of the scrape* methods and downloads the URLs
    present in their ``photo`` and ``video`` attachments into the destination folder.
    Other attachment types are ignored.

    Files are named ``<result id>_<index><extension>``. Photos keep the extension found in
    the URL path; videos get the extension chosen by yt-dlp.

    Parameters
    ----------
    results : List[dict]
        list with valid dictionary results (see class definition); each item must have
        an "id" and an "attachments" dict
    destination : str
        the directory to save the downloaded files to, created if missing. defaults to output/

    Returns
    -------
    a list of filenames for the downloaded files

    Raises
    ------
    KeyError
        if a result has no "attachments" or "id" key
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    mkdir_if_not_exists(destination)
    downloaded = []
    for r in results:
        for k, attachments in r["attachments"].items():
            if k == "photo":
                for i, url in enumerate(attachments):
                    ext = os.path.splitext(urlparse(url).path)[1]
                    filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
                    d = requests.get(url, headers=headers)
                    with open(filename, "wb") as f:
                        f.write(d.content)
                    downloaded.append(filename)
            elif k == "video":
                for i, url in enumerate(attachments):
                    # yt-dlp fills in %(ext)s once it knows the container format
                    filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
                    ydl = yt_dlp.YoutubeDL(
                        {
                            "outtmpl": filename,
                            "quiet": True,
                            "restrictfilenames": True,
                            "forcefilename": True,
                        }
                    )
                    info = ydl.extract_info(url, download=True)
                    filename = ydl.prepare_filename(info)
                    stem, ext = os.path.splitext(filename)
                    if ext == ".unknown_video":
                        # NOTE(review): presumably yt-dlp's placeholder extension when the
                        # format is unknown; such files are stored as .mkv instead.
                        # Only the extension is rewritten (never a directory name), and the
                        # file is renamed in place rather than copied through memory.
                        new_filename = f"{stem}.mkv"
                        os.replace(filename, new_filename)
                        filename = new_filename
                    downloaded.append(filename)
    return downloaded

16
vk_url_scraper/utils.py Normal file
View File

@@ -0,0 +1,16 @@
import json
import os
from datetime import datetime
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``datetime`` objects as their ``str()`` form.

    Usage: ``json.dumps(obj, cls=DateTimeEncoder)``.
    """

    def default(self, o):
        # Anything that is not a datetime is left to the base encoder.
        if not isinstance(o, datetime):
            return super().default(o)
        return str(o)  # with timezone
def mkdir_if_not_exists(folder):
    """Create ``folder`` (and any missing parent directories) unless it already exists.

    Parameters
    ----------
    folder : str
        path of the directory to create

    Raises
    ------
    FileExistsError
        if ``folder`` exists but is not a directory
    """
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(folder, exist_ok=True)

View File

@@ -1,8 +1,8 @@
_MAJOR = "0"
_MINOR = "1"
_MINOR = "2"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "2"
_PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""