adds command line interface

This commit is contained in:
msramalho
2022-06-20 23:52:14 +02:00
parent 50b78d618a
commit c9a3ece9af
11 changed files with 354 additions and 12 deletions

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
.env
vk_config.v2.json
output/
# build artifacts
.eggs/

View File

@@ -5,6 +5,7 @@ name = "pypi"
[packages]
vk-api = "*"
yt-dlp = "*"
[dev-packages]
sphinx-copybutton = "==0.5.0"

177
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "bab533e734f6da55647cc76a9f5a51d46c641723d485e38a16e2e31bca097130"
"sha256": "4224e1159b48a3e903601184bf0d3f7613a817b5fca7062a119c549563527798"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,74 @@
]
},
"default": {
"brotli": {
"hashes": [
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
"sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
"sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
"sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
"sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"markers": "platform_python_implementation == 'CPython'",
"version": "==1.0.9"
},
"certifi": {
"hashes": [
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
@@ -40,6 +108,47 @@
"markers": "python_full_version >= '3.5.0'",
"version": "==3.3"
},
"mutagen": {
"hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version < '4' and python_full_version >= '3.5.0'",
"version": "==1.45.1"
},
"pycryptodomex": {
"hashes": [
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
"sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659",
"sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45",
"sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2",
"sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489",
"sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2",
"sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750",
"sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151",
"sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2",
"sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045",
"sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859",
"sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89",
"sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e",
"sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461",
"sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6",
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f",
"sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb",
"sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662",
"sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b",
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe",
"sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8",
"sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125",
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c",
"sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48",
"sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff",
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"requests": {
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
@@ -63,6 +172,68 @@
],
"index": "pypi",
"version": "==11.9.8"
},
"websockets": {
"hashes": [
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",
"sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c",
"sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76",
"sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47",
"sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69",
"sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079",
"sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c",
"sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55",
"sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02",
"sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559",
"sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3",
"sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e",
"sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978",
"sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98",
"sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae",
"sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755",
"sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d",
"sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991",
"sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1",
"sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680",
"sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247",
"sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f",
"sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2",
"sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7",
"sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4",
"sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667",
"sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb",
"sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094",
"sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36",
"sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79",
"sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500",
"sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e",
"sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582",
"sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442",
"sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd",
"sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6",
"sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731",
"sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4",
"sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d",
"sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8",
"sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f",
"sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677",
"sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8",
"sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9",
"sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e",
"sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b",
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
],
"markers": "python_version >= '3.7'",
"version": "==10.3"
},
"yt-dlp": {
"hashes": [
"sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134",
"sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7"
],
"index": "pypi",
"version": "==2022.5.18"
}
},
"develop": {
@@ -351,7 +522,7 @@
"sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700",
"sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec"
],
"markers": "python_version >= '3.7'",
"markers": "python_version < '3.10'",
"version": "==4.11.4"
},
"iniconfig": {
@@ -720,7 +891,7 @@
"sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2",
"sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec"
],
"markers": "python_version < '4' and python_full_version >= '3.6.3'",
"markers": "python_full_version >= '3.6.3' and python_full_version < '4.0.0'",
"version": "==12.4.4"
},
"secretstorage": {

View File

@@ -1,11 +1,37 @@
# vk-url-scraper
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage).
## Quick usage API
`pip install vk-url-scraper` to install.
## Installation
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
To use the library you will need a valid username/password combination for vk.com.
## Command line usage
```bash
# run this to learn more about the parameters
vk_url_scraper --help
# scrape a URL and get the JSON result in the console
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789
# OR
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
# you can also have multiple urls
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
# save the JSON output into a file
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
# download any photos or videos found in these URLS
# this will use or create an output/ folder and dump the files there
vk_url_scraper -u "username here" -p "password here" --download --urls https://vk.com/wall12345_6789
# or
vk_url_scraper -u "username here" -p "password here" -d --urls https://vk.com/wall12345_6789
```
## Python library usage
```python
from vk_url_scraper import VkScraper
@@ -41,6 +67,8 @@ print(res[0]["text]) # eg: -> to get the text from code
see [docs] for all available functions.
### TODO
* scrape album links
* scrape profile links
* docs online from sphinx
## Development
@@ -54,6 +82,9 @@ see [docs] for all available functions.
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
3. `make docs` to generate shpynx docs -> edit [config.py](docs/source/conf.py) if needed
To test the command line interface available in [__main__.py](__vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
## Releasing new version
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
2. run `./scripts/release.sh` to create a tag and push, alternatively

View File

@@ -47,7 +47,7 @@ setup(
url="https://github.com/bellingcat/vk-url-scraper",
author="Bellingcat",
author_email="tech@bellingcat.com",
license="Apache",
license="MIT",
packages=find_packages(
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
),

View File

@@ -1,12 +1,11 @@
import datetime
import os
import tempfile
import pytest
from vk_url_scraper import VkScraper
from .util import assert_equal_lists
vks = None
@@ -82,12 +81,30 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 15
assert len(res[0]["attachments"].keys()) == 3
assert_equal_lists(list(res[0]["attachments"].keys()), ["photo", "link", "video"])
for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys())
assert len(res[0]["attachments"]["photo"]) == 5
assert len(res[0]["attachments"]["link"]) == 1
assert len(res[0]["attachments"]["video"]) == 1
def test_scrape_download_multiple_media():
res = vks.scrape("https://vk.com/w=wall-17315087_74182")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
expect_files = {
"wall-17315087_74182_0.jpg",
"wall-17315087_74182_1.jpg",
"wall-17315087_74182_2.jpg",
"wall-17315087_74182_3.jpg",
"wall-17315087_74182_4.jpg",
"wall-17315087_74182_0.mkv",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)
def test_scrape_photo_only():
res = vks.scrape("https://vk.com/apiclub?z=photo-1_457242435%2Falbum-1_00%2Frev")
assert len(res) == 1

View File

@@ -1,3 +0,0 @@
def assert_equal_lists(l1, l2):
assert len(l1) == len(l2)
assert str(sorted(l1)) == str(sorted(l2))

View File

@@ -1 +1,2 @@
from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists

View File

@@ -0,0 +1,63 @@
import argparse
import json
from .scraper import VkScraper
from .utils import DateTimeEncoder
def get_argument_parser():
"""
Creates the CMD line arguments. 'python vk_url_scraper.py --help'
"""
parser = argparse.ArgumentParser(
description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
)
parser.add_argument(
"-u",
"--username",
action="store",
dest="username",
required=True,
help="username for a valid vk.com account",
)
parser.add_argument(
"-p",
"--password",
action="store",
dest="password",
required=True,
help="password for the valid vk.com account",
)
parser.add_argument(
"-d",
"--download",
action=argparse.BooleanOptionalAction,
dest="download",
help="if set then all photos and videos will be downloaded to folder output/",
)
parser.add_argument(
"--urls",
action="store",
dest="urls",
nargs=argparse.REMAINDER,
required=True,
help="must be the last argument: any text with one or more urls to scrape",
)
return parser
def main():
parser = get_argument_parser()
args = parser.parse_args()
vks = VkScraper(args.username, args.password)
text = " ".join(args.urls)
res = vks.scrape(text)
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
print(res_json)
if args.download:
vks.download_media(res)
if __name__ == "__main__":
main()

View File

@@ -1,10 +1,15 @@
import os
import re
from collections import defaultdict
from datetime import datetime
from typing import List
from urllib.parse import urlparse
import requests
import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url
from .utils import mkdir_if_not_exists
class VkScraper:
@@ -273,3 +278,42 @@ class VkScraper:
}
)
return res
def download_media(self, results: List[dict], destination: str = "./output/") -> List[str]:
"""
Receives a list of dicts as returned by any of the scrape* methods and downloads the URLS present
if they are of type photo or video into the destination folder
Parameters
----------
results : List[dict]
list with valid dictionary results (see class definition)
destination : str
the directory to save the downloaded files to. defaults to output/
Returns
-------
a list of filenames for the downloaded files
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
mkdir_if_not_exists(destination)
downloaded = []
for r in results:
for k, attachments in r["attachments"].items():
if k == "photo":
for i, url in enumerate(attachments):
ext = os.path.splitext(urlparse(url).path)[1]
filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
d = requests.get(url, headers=headers)
with open(filename, "wb") as f:
f.write(d.content)
downloaded.append(filename)
elif k == "video":
for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
ydl = yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True})
ydl.extract_info(url, download=True)
downloaded.append(filename)
return downloaded

16
vk_url_scraper/utils.py Normal file
View File

@@ -0,0 +1,16 @@
import json
import os
from datetime import datetime
class DateTimeEncoder(json.JSONEncoder):
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
def default(self, o):
if isinstance(o, datetime):
return str(o) # with timezone
return json.JSONEncoder.default(self, o)
def mkdir_if_not_exists(folder):
if not os.path.exists(folder):
os.makedirs(folder)