mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-08 03:18:37 +03:00
adds command line interface
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,6 @@
|
||||
.env
|
||||
vk_config.v2.json
|
||||
output/
|
||||
# build artifacts
|
||||
|
||||
.eggs/
|
||||
|
||||
1
Pipfile
1
Pipfile
@@ -5,6 +5,7 @@ name = "pypi"
|
||||
|
||||
[packages]
|
||||
vk-api = "*"
|
||||
yt-dlp = "*"
|
||||
|
||||
[dev-packages]
|
||||
sphinx-copybutton = "==0.5.0"
|
||||
|
||||
177
Pipfile.lock
generated
177
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "bab533e734f6da55647cc76a9f5a51d46c641723d485e38a16e2e31bca097130"
|
||||
"sha256": "4224e1159b48a3e903601184bf0d3f7613a817b5fca7062a119c549563527798"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -16,6 +16,74 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
|
||||
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
|
||||
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
|
||||
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
|
||||
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
|
||||
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
|
||||
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
|
||||
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
|
||||
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
|
||||
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
|
||||
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
|
||||
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
|
||||
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
|
||||
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
|
||||
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
|
||||
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
|
||||
"sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
|
||||
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
|
||||
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
|
||||
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
|
||||
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
|
||||
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
|
||||
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
|
||||
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
|
||||
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
|
||||
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
|
||||
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
|
||||
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
|
||||
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
|
||||
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
|
||||
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
|
||||
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
|
||||
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
|
||||
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
|
||||
"sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
|
||||
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
|
||||
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
|
||||
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
|
||||
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
|
||||
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
|
||||
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
|
||||
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
|
||||
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
|
||||
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
|
||||
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
|
||||
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
|
||||
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
|
||||
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
|
||||
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
|
||||
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
|
||||
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
|
||||
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
|
||||
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
|
||||
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
|
||||
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
|
||||
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
|
||||
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
|
||||
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
|
||||
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
|
||||
"sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
|
||||
"sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
|
||||
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
|
||||
],
|
||||
"markers": "platform_python_implementation == 'CPython'",
|
||||
"version": "==1.0.9"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
|
||||
@@ -40,6 +108,47 @@
|
||||
"markers": "python_full_version >= '3.5.0'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"mutagen": {
|
||||
"hashes": [
|
||||
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
|
||||
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
|
||||
],
|
||||
"markers": "python_version < '4' and python_full_version >= '3.5.0'",
|
||||
"version": "==1.45.1"
|
||||
},
|
||||
"pycryptodomex": {
|
||||
"hashes": [
|
||||
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
|
||||
"sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659",
|
||||
"sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45",
|
||||
"sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2",
|
||||
"sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489",
|
||||
"sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2",
|
||||
"sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750",
|
||||
"sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151",
|
||||
"sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2",
|
||||
"sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045",
|
||||
"sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859",
|
||||
"sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89",
|
||||
"sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e",
|
||||
"sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461",
|
||||
"sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6",
|
||||
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f",
|
||||
"sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb",
|
||||
"sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662",
|
||||
"sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b",
|
||||
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe",
|
||||
"sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8",
|
||||
"sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125",
|
||||
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c",
|
||||
"sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48",
|
||||
"sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff",
|
||||
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
|
||||
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==3.14.1"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
@@ -63,6 +172,68 @@
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==11.9.8"
|
||||
},
|
||||
"websockets": {
|
||||
"hashes": [
|
||||
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",
|
||||
"sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c",
|
||||
"sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76",
|
||||
"sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47",
|
||||
"sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69",
|
||||
"sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079",
|
||||
"sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c",
|
||||
"sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55",
|
||||
"sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02",
|
||||
"sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559",
|
||||
"sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3",
|
||||
"sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e",
|
||||
"sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978",
|
||||
"sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98",
|
||||
"sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae",
|
||||
"sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755",
|
||||
"sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d",
|
||||
"sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991",
|
||||
"sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1",
|
||||
"sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680",
|
||||
"sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247",
|
||||
"sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f",
|
||||
"sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2",
|
||||
"sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7",
|
||||
"sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4",
|
||||
"sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667",
|
||||
"sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb",
|
||||
"sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094",
|
||||
"sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36",
|
||||
"sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79",
|
||||
"sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500",
|
||||
"sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e",
|
||||
"sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582",
|
||||
"sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442",
|
||||
"sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd",
|
||||
"sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6",
|
||||
"sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731",
|
||||
"sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4",
|
||||
"sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d",
|
||||
"sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8",
|
||||
"sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f",
|
||||
"sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677",
|
||||
"sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8",
|
||||
"sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9",
|
||||
"sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e",
|
||||
"sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b",
|
||||
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
|
||||
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==10.3"
|
||||
},
|
||||
"yt-dlp": {
|
||||
"hashes": [
|
||||
"sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134",
|
||||
"sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2022.5.18"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
@@ -351,7 +522,7 @@
|
||||
"sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700",
|
||||
"sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"markers": "python_version < '3.10'",
|
||||
"version": "==4.11.4"
|
||||
},
|
||||
"iniconfig": {
|
||||
@@ -720,7 +891,7 @@
|
||||
"sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2",
|
||||
"sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec"
|
||||
],
|
||||
"markers": "python_version < '4' and python_full_version >= '3.6.3'",
|
||||
"markers": "python_full_version >= '3.6.3' and python_full_version < '4.0.0'",
|
||||
"version": "==12.4.4"
|
||||
},
|
||||
"secretstorage": {
|
||||
|
||||
35
README.md
35
README.md
@@ -1,11 +1,37 @@
|
||||
# vk-url-scraper
|
||||
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
|
||||
|
||||
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage).
|
||||
|
||||
## Quick usage API
|
||||
`pip install vk-url-scraper` to install.
|
||||
## Installation
|
||||
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
||||
|
||||
To use the library you will need a valid username/password combination for vk.com.
|
||||
|
||||
## Command line usage
|
||||
```bash
|
||||
# run this to learn more about the parameters
|
||||
vk_url_scraper --help
|
||||
|
||||
# scrape a URL and get the JSON result in the console
|
||||
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
# OR
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
|
||||
# you can also have multiple urls
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
||||
|
||||
|
||||
# save the JSON output into a file
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
|
||||
|
||||
# download any photos or videos found in these URLS
|
||||
# this will use or create an output/ folder and dump the files there
|
||||
vk_url_scraper -u "username here" -p "password here" --download --urls https://vk.com/wall12345_6789
|
||||
# or
|
||||
vk_url_scraper -u "username here" -p "password here" -d --urls https://vk.com/wall12345_6789
|
||||
```
|
||||
|
||||
## Python library usage
|
||||
```python
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
@@ -41,6 +67,8 @@ print(res[0]["text"]) # eg: -> to get the text from code
|
||||
see [docs] for all available functions.
|
||||
|
||||
### TODO
|
||||
* scrape album links
|
||||
* scrape profile links
|
||||
* docs online from sphinx
|
||||
|
||||
## Development
|
||||
@@ -54,6 +82,9 @@ see [docs] for all available functions.
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose output, colors, and test docstring examples)
|
||||
4. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||
|
||||
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
|
||||
|
||||
|
||||
## Releasing new version
|
||||
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
|
||||
2. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
|
||||
2
setup.py
2
setup.py
@@ -47,7 +47,7 @@ setup(
|
||||
url="https://github.com/bellingcat/vk-url-scraper",
|
||||
author="Bellingcat",
|
||||
author_email="tech@bellingcat.com",
|
||||
license="Apache",
|
||||
license="MIT",
|
||||
packages=find_packages(
|
||||
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
|
||||
),
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
import datetime
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from .util import assert_equal_lists
|
||||
|
||||
vks = None
|
||||
|
||||
|
||||
@@ -82,12 +81,30 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["attachments"].keys()) == 3
|
||||
assert_equal_lists(list(res[0]["attachments"].keys()), ["photo", "link", "video"])
|
||||
for k in ["photo", "link", "video"]:
|
||||
assert k in list(res[0]["attachments"].keys())
|
||||
assert len(res[0]["attachments"]["photo"]) == 5
|
||||
assert len(res[0]["attachments"]["link"]) == 1
|
||||
assert len(res[0]["attachments"]["video"]) == 1
|
||||
|
||||
|
||||
def test_scrape_download_multiple_media():
    """Scrape one wall post and check every expected photo/video file gets downloaded."""
    wanted = {
        "wall-17315087_74182_0.jpg",
        "wall-17315087_74182_1.jpg",
        "wall-17315087_74182_2.jpg",
        "wall-17315087_74182_3.jpg",
        "wall-17315087_74182_4.jpg",
        "wall-17315087_74182_0.mkv",
    }
    scraped = vks.scrape("https://vk.com/w=wall-17315087_74182")

    # the temporary folder lives in the working directory and is removed on exit
    with tempfile.TemporaryDirectory(dir="./") as workdir:
        vks.download_media(scraped, workdir)
        present = set(os.listdir(workdir))
        # extra files are tolerated; every expected file must be there
        assert wanted.issubset(present)
|
||||
|
||||
|
||||
def test_scrape_photo_only():
|
||||
res = vks.scrape("https://vk.com/apiclub?z=photo-1_457242435%2Falbum-1_00%2Frev")
|
||||
assert len(res) == 1
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
def assert_equal_lists(l1, l2):
    """
    Assert that two lists hold the same elements regardless of order.

    The lists must have the same length and the string forms of their
    sorted copies must match.

    Parameters
    ----------
    l1 : list
        first list, its elements must be sortable
    l2 : list
        second list, its elements must be sortable

    Raises
    ------
    AssertionError
        if the lengths differ or the sorted contents differ; the message
        shows both sides so the failure can be diagnosed from the report
    """
    assert len(l1) == len(l2), f"lists differ in length: {len(l1)} != {len(l2)}"
    sorted_l1, sorted_l2 = sorted(l1), sorted(l2)
    assert str(sorted_l1) == str(sorted_l2), f"lists differ: {sorted_l1} != {sorted_l2}"
|
||||
@@ -1 +1,2 @@
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
||||
|
||||
63
vk_url_scraper/__main__.py
Normal file
63
vk_url_scraper/__main__.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder
|
||||
|
||||
|
||||
def get_argument_parser():
    """
    Builds the command line argument parser for the scraper.

    Run ``vk_url_scraper --help`` (or ``python -m vk_url_scraper --help``)
    to see the rendered help text.

    Returns
    -------
    argparse.ArgumentParser
        parser that produces a namespace with ``username``, ``password``,
        ``download`` and ``urls`` attributes
    """
    parser = argparse.ArgumentParser(
        # pin the program name: by default argparse derives it from how the
        # program was launched, which can show up as "__main__.py" in the
        # usage line when started with "python -m"
        prog="vk_url_scraper",
        description="Authenticate and scrape information from vk.com based on a URL or set of URLs.",
    )

    parser.add_argument(
        "-u",
        "--username",
        action="store",
        dest="username",
        required=True,
        help="username for a valid vk.com account",
    )
    parser.add_argument(
        "-p",
        "--password",
        action="store",
        dest="password",
        required=True,
        help="password for the valid vk.com account",
    )
    parser.add_argument(
        "-d",
        "--download",
        # BooleanOptionalAction also generates a "--no-download" flag
        action=argparse.BooleanOptionalAction,
        dest="download",
        help="if set then all photos and videos will be downloaded to folder output/",
    )
    parser.add_argument(
        "--urls",
        action="store",
        dest="urls",
        # REMAINDER swallows everything after --urls (even tokens that look
        # like options), which is why it has to be the last argument
        nargs=argparse.REMAINDER,
        required=True,
        help="must be the last argument: any text with one or more urls to scrape",
    )
    return parser
|
||||
|
||||
|
||||
def main():
    """
    Command line entry point.

    Parses the arguments, logs in, scrapes the given text for vk.com URLs,
    prints the result as JSON and, when requested, downloads the media found.
    """
    cli_args = get_argument_parser().parse_args()
    scraper = VkScraper(cli_args.username, cli_args.password)

    # all tokens after --urls are scraped as a single space-separated text
    results = scraper.scrape(" ".join(cli_args.urls))
    print(json.dumps(results, ensure_ascii=False, indent=4, cls=DateTimeEncoder))

    if not cli_args.download:
        return
    scraper.download_media(results)


if __name__ == "__main__":
    main()
|
||||
@@ -1,10 +1,15 @@
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
import vk_api # used to get api_token after authentication
|
||||
import yt_dlp # to download videos from url
|
||||
|
||||
from .utils import mkdir_if_not_exists
|
||||
|
||||
|
||||
class VkScraper:
|
||||
@@ -273,3 +278,42 @@ class VkScraper:
|
||||
}
|
||||
)
|
||||
return res
|
||||
|
||||
def download_media(
    self, results: List[dict], destination: str = "./output/", timeout: float = 30
) -> List[str]:
    """
    Receives a list of dicts as returned by any of the scrape* methods and downloads the URLs present
    if they are of type photo or video into the destination folder. Other attachment types are ignored.

    Files are named ``<result id>_<index><extension>``; photos keep the extension
    found in their URL and videos are always saved with ``.mkv``.

    Parameters
    ----------
    results : List[dict]
        list with valid dictionary results (see class definition); each one is read
        through its "id" and "attachments" keys
    destination : str
        the directory to save the downloaded files to. defaults to output/
    timeout : float
        seconds to wait on the server for each photo request. defaults to 30;
        it is not applied to video downloads

    Returns
    -------
    List[str]
        a list of filenames for the downloaded files

    Raises
    ------
    requests.HTTPError
        if a photo URL answers with an HTTP error status
    requests.Timeout
        if a photo request does not get an answer within ``timeout`` seconds
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    mkdir_if_not_exists(destination)
    downloaded = []
    for r in results:
        for k, attachments in r["attachments"].items():
            if k == "photo":
                for i, url in enumerate(attachments):
                    ext = os.path.splitext(urlparse(url).path)[1]
                    filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
                    # without a timeout a stalled server would block forever
                    response = requests.get(url, headers=headers, timeout=timeout)
                    # do not save an HTTP error page as if it were the image
                    response.raise_for_status()
                    with open(filename, "wb") as f:
                        f.write(response.content)
                    downloaded.append(filename)
            elif k == "video":
                for i, url in enumerate(attachments):
                    filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
                    # the context manager lets the downloader clean up after itself
                    with yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True}) as ydl:
                        ydl.extract_info(url, download=True)
                    downloaded.append(filename)
    return downloaded
|
||||
|
||||
16
vk_url_scraper/utils.py
Normal file
16
vk_url_scraper/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class DateTimeEncoder(json.JSONEncoder):
    """
    JSON encoder that writes ``datetime`` values as their ``str()`` form.

    Use it as ``json.dumps(obj, cls=DateTimeEncoder)``.
    """

    def default(self, o):
        # everything that is not a datetime is left to the stock encoder
        if not isinstance(o, datetime):
            return super().default(o)
        return str(o)  # with timezone
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
    """
    Create ``folder``, including any missing parent directories, unless it already exists.

    Parameters
    ----------
    folder : str
        path of the directory to create

    Raises
    ------
    FileExistsError
        if ``folder`` exists but is not a directory
    """
    # exist_ok removes the gap between checking for the folder and creating it,
    # so two concurrent callers can no longer fail on each other
    os.makedirs(folder, exist_ok=True)
|
||||
Reference in New Issue
Block a user