diff --git a/.gitignore b/.gitignore index b7b2c87..f61cfd6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .env vk_config.v2.json +output/ # build artifacts .eggs/ diff --git a/Pipfile b/Pipfile index ade4841..2d42ab4 100644 --- a/Pipfile +++ b/Pipfile @@ -5,6 +5,7 @@ name = "pypi" [packages] vk-api = "*" +yt-dlp = "*" [dev-packages] sphinx-copybutton = "==0.5.0" diff --git a/Pipfile.lock b/Pipfile.lock index fcd83cc..21872a2 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "bab533e734f6da55647cc76a9f5a51d46c641723d485e38a16e2e31bca097130" + "sha256": "4224e1159b48a3e903601184bf0d3f7613a817b5fca7062a119c549563527798" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,74 @@ ] }, "default": { + "brotli": { + "hashes": [ + "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d", + "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8", + "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b", + "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c", + "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c", + "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70", + "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f", + "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181", + "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130", + "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19", + "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa", + "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429", + "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126", + "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4", + "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0", + 
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b", + "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6", + "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438", + "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f", + "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389", + "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6", + "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26", + "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7", + "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14", + "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2", + "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430", + "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296", + "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12", + "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f", + "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d", + "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a", + "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452", + "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c", + "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761", + "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649", + "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b", + "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea", + "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c", + "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a", + "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031", + "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267", 
+ "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5", + "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7", + "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d", + "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c", + "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43", + "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa", + "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17", + "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb", + "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb", + "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b", + "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4", + "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3", + "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7", + "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1", + "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb", + "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91", + "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b", + "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1", + "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806", + "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3", + "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1" + ], + "markers": "platform_python_implementation == 'CPython'", + "version": "==1.0.9" + }, "certifi": { "hashes": [ "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", @@ -40,6 +108,47 @@ "markers": "python_full_version >= '3.5.0'", "version": "==3.3" }, + "mutagen": { + "hashes": [ + 
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", + "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" + ], + "markers": "python_version < '4' and python_full_version >= '3.5.0'", + "version": "==1.45.1" + }, + "pycryptodomex": { + "hashes": [ + "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", + "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", + "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", + "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", + "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", + "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", + "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", + "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", + "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", + "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", + "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", + "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", + "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", + "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", + "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", + "sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", + "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", + "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", + "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", + "sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", + "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", + "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", + 
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", + "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", + "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", + "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", + "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.14.1" + }, "requests": { "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", @@ -63,6 +172,68 @@ ], "index": "pypi", "version": "==11.9.8" + }, + "websockets": { + "hashes": [ + "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", + "sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c", + "sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76", + "sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47", + "sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69", + "sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079", + "sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c", + "sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55", + "sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02", + "sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559", + "sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3", + "sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e", + "sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978", + "sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98", + "sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae", + "sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755", + 
"sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d", + "sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991", + "sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1", + "sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680", + "sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247", + "sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f", + "sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2", + "sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7", + "sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4", + "sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667", + "sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb", + "sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094", + "sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36", + "sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79", + "sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500", + "sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e", + "sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582", + "sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442", + "sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd", + "sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6", + "sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731", + "sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4", + "sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d", + "sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8", + "sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f", + "sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677", 
+ "sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8", + "sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9", + "sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e", + "sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b", + "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", + "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" + ], + "markers": "python_version >= '3.7'", + "version": "==10.3" + }, + "yt-dlp": { + "hashes": [ + "sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134", + "sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7" + ], + "index": "pypi", + "version": "==2022.5.18" } }, "develop": { @@ -351,7 +522,7 @@ "sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700", "sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec" ], - "markers": "python_version >= '3.7'", + "markers": "python_version < '3.10'", "version": "==4.11.4" }, "iniconfig": { @@ -720,7 +891,7 @@ "sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2", "sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec" ], - "markers": "python_version < '4' and python_full_version >= '3.6.3'", + "markers": "python_full_version >= '3.6.3' and python_full_version < '4.0.0'", "version": "==12.4.4" }, "secretstorage": { diff --git a/README.md b/README.md index a871ea7..2a75706 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,37 @@ # vk-url-scraper Library to scrape data and especially media links (videos and photos) from vk.com URLs. +You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage). -## Quick usage API -`pip install vk-url-scraper` to install. +## Installation +You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`. 
+ +To use the library you will need a valid username/password combination for vk.com. + +## Command line usage +```bash +# run this to learn more about the parameters +vk_url_scraper --help + +# scrape a URL and get the JSON result in the console +vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789 +# OR +vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 +# you can also have multiple urls +vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789 +# save the JSON output into a file +vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json + +# download any photos or videos found in these URLS +# this will use or create an output/ folder and dump the files there +vk_url_scraper -u "username here" -p "password here" --download --urls https://vk.com/wall12345_6789 +# or +vk_url_scraper -u "username here" -p "password here" -d --urls https://vk.com/wall12345_6789 +``` + +## Python library usage ```python from vk_url_scraper import VkScraper @@ -41,6 +67,8 @@ print(res[0]["text]) # eg: -> to get the text from code see [docs] for all available functions. ### TODO +* scrape album links +* scrape profile links * docs online from sphinx ## Development @@ -54,6 +82,9 @@ see [docs] for all available functions. 3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples) 3. `make docs` to generate shpynx docs -> edit [config.py](docs/source/conf.py) if needed +To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...` + + ## Releasing new version 1. edit [version.py](vk_url_scraper/version.py) with proper versioning 2. 
run `./scripts/release.sh` to create a tag and push, alternatively
diff --git a/setup.py b/setup.py
index 40d71da..87afb63 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@ setup(
     url="https://github.com/bellingcat/vk-url-scraper",
     author="Bellingcat",
     author_email="tech@bellingcat.com",
-    license="Apache",
+    license="MIT",
     packages=find_packages(
         exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
     ),
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index 76ecf96..81bbd77 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,12 +1,11 @@
 import datetime
 import os
+import tempfile
 
 import pytest
 
 from vk_url_scraper import VkScraper
 
-from .util import assert_equal_lists
-
 vks = None
 
 
@@ -82,12 +81,31 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
     assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
     assert len(res[0]["payload"]) == 15
     assert len(res[0]["attachments"].keys()) == 3
-    assert_equal_lists(list(res[0]["attachments"].keys()), ["photo", "link", "video"])
+    for kind in ["photo", "link", "video"]:
+        assert kind in res[0]["attachments"]
     assert len(res[0]["attachments"]["photo"]) == 5
     assert len(res[0]["attachments"]["link"]) == 1
     assert len(res[0]["attachments"]["video"]) == 1
 
 
+def test_scrape_download_multiple_media():
+    # expects 5 photos and 1 video to be written into a throwaway folder
+    scraped = vks.scrape("https://vk.com/w=wall-17315087_74182")
+
+    with tempfile.TemporaryDirectory(dir="./") as tempdir:
+        vks.download_media(scraped, tempdir)
+        found_files = set(os.listdir(tempdir))
+        expect_files = {
+            "wall-17315087_74182_0.jpg",
+            "wall-17315087_74182_1.jpg",
+            "wall-17315087_74182_2.jpg",
+            "wall-17315087_74182_3.jpg",
+            "wall-17315087_74182_4.jpg",
+            "wall-17315087_74182_0.mkv",
+        }
+        assert expect_files <= found_files
+
+
 def test_scrape_photo_only():
     res = vks.scrape("https://vk.com/apiclub?z=photo-1_457242435%2Falbum-1_00%2Frev")
     assert len(res) == 1
diff --git a/tests/util.py b/tests/util.py
deleted
file mode 100644
index c9bd2e2..0000000
--- a/tests/util.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def assert_equal_lists(l1, l2):
-    assert len(l1) == len(l2)
-    assert str(sorted(l1)) == str(sorted(l2))
diff --git a/vk_url_scraper/__init__.py b/vk_url_scraper/__init__.py
index 2050a28..9f50225 100644
--- a/vk_url_scraper/__init__.py
+++ b/vk_url_scraper/__init__.py
@@ -1 +1,2 @@
 from .scraper import VkScraper
+from .utils import DateTimeEncoder, mkdir_if_not_exists
diff --git a/vk_url_scraper/__main__.py b/vk_url_scraper/__main__.py
new file mode 100644
index 0000000..c0c957d
--- /dev/null
+++ b/vk_url_scraper/__main__.py
@@ -0,0 +1,71 @@
+import argparse
+import json
+
+from .scraper import VkScraper
+from .utils import DateTimeEncoder
+
+
+def get_argument_parser():
+    """
+    Builds the command line argument parser. 'python -m vk_url_scraper --help'
+
+    Returns
+    -------
+    an argparse.ArgumentParser with the username, password, download and urls options
+    """
+    parser = argparse.ArgumentParser(
+        description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
+    )
+
+    parser.add_argument(
+        "-u",
+        "--username",
+        action="store",
+        dest="username",
+        required=True,
+        help="username for a valid vk.com account",
+    )
+    parser.add_argument(
+        "-p",
+        "--password",
+        action="store",
+        dest="password",
+        required=True,
+        help="password for the valid vk.com account",
+    )
+    parser.add_argument(
+        "-d",
+        "--download",
+        # BooleanOptionalAction only exists on python >= 3.9, fall back to a plain flag before that
+        action=getattr(argparse, "BooleanOptionalAction", "store_true"),
+        dest="download",
+        help="if set then all photos and videos will be downloaded to folder output/",
+    )
+    parser.add_argument(
+        "--urls",
+        action="store",
+        dest="urls",
+        # REMAINDER collects everything that follows --urls, which is why it must come last
+        nargs=argparse.REMAINDER,
+        required=True,
+        help="must be the last argument: any text with one or more urls to scrape",
+    )
+    return parser
+
+
+def main():
+    """
+    Command line entry point: authenticates, scrapes the given urls, prints the
+    results as JSON and, if --download was passed, downloads the media found
+    """
+    args = get_argument_parser().parse_args()
+    vks = VkScraper(args.username, args.password)
+    # scrape() is given a single text, so the urls are joined back with spaces
+    res = vks.scrape(" ".join(args.urls))
+    print(json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder))
+    if args.download:
+        vks.download_media(res)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py
index eafd11c..5f13130 100644
--- a/vk_url_scraper/scraper.py
+++ b/vk_url_scraper/scraper.py
@@ -1,10 +1,15 @@
+import os
 import re
 from collections import defaultdict
 from datetime import datetime
 from typing import List
+from urllib.parse import urlparse
 
 import requests
 import vk_api  # used to get api_token after authentication
+import yt_dlp  # to download videos from url
+
+from .utils import mkdir_if_not_exists
 
 
 class VkScraper:
@@ -273,3 +278,58 @@ class VkScraper:
                 }
             )
         return res
+
+    def download_media(
+        self, results: List[dict], destination: str = "./output/", timeout: float = 30.0
+    ) -> List[str]:
+        """
+        Receives a list of dicts as returned by any of the scrape* methods and downloads the URLS present
+        if they are of type photo or video into the destination folder
+
+        Photos are saved as <id>_<index><extension from the url> and videos as <id>_<index>.mkv,
+        where index is the position of the url in its attachment list
+
+        Parameters
+        ----------
+        results : List[dict]
+            list with valid dictionary results (see class definition)
+        destination : str
+            the directory to save the downloaded files to. defaults to output/
+        timeout : float
+            seconds to wait for the server when fetching a photo. defaults to 30
+
+        Returns
+        -------
+        a list of filenames for the downloaded files
+
+        Raises
+        ------
+        requests.RequestException
+            if a photo cannot be fetched (connection problem, timeout or HTTP error status)
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
+        }
+        mkdir_if_not_exists(destination)
+        downloaded = []
+        for r in results:
+            for k, attachments in r["attachments"].items():
+                if k == "photo":
+                    for i, url in enumerate(attachments):
+                        ext = os.path.splitext(urlparse(url).path)[1]
+                        filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
+                        # without a timeout a stalled server would block the download forever
+                        d = requests.get(url, headers=headers, timeout=timeout)
+                        # fail loudly instead of saving an error page as if it was the photo
+                        d.raise_for_status()
+                        with open(filename, "wb") as f:
+                            f.write(d.content)
+                        downloaded.append(filename)
+                elif k == "video":
+                    for i, url in enumerate(attachments):
+                        filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
+                        # context manager usage as shown in the yt-dlp embedding examples
+                        with yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True}) as ydl:
+                            ydl.extract_info(url, download=True)
+                        downloaded.append(filename)
+        return downloaded
diff --git a/vk_url_scraper/utils.py b/vk_url_scraper/utils.py
new file mode 100644
index 0000000..742ab8b
--- /dev/null
+++ b/vk_url_scraper/utils.py
@@ -0,0 +1,24 @@
+import json
+import os
+from datetime import datetime
+
+
+class DateTimeEncoder(json.JSONEncoder):
+    """
+    JSON encoder that also accepts datetime objects, which are written as str(datetime)
+    to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
+    """
+
+    def default(self, o):
+        if isinstance(o, datetime):
+            return str(o)  # includes the utc offset for timezone aware datetimes
+        # anything else is left to the base class, which raises TypeError
+        return super().default(o)
+
+
+def mkdir_if_not_exists(folder):
+    """
+    Creates folder (and any missing parent folders), does nothing if it already exists
+    """
+    # exist_ok avoids the race between checking for the folder and creating it
+    os.makedirs(folder, exist_ok=True)