diff --git a/Pipfile b/Pipfile index 5f86225..5457fcc 100644 --- a/Pipfile +++ b/Pipfile @@ -16,10 +16,11 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" -youtube-dl = "*" +yt-dlp = "*" telethon = "*" pytesseract = "*" pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} +instaloader = "*" [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 5a75176..50622b1 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "afacc6dd45c110f235861c54db45f5546fb0095f4e68a1084e85fd0e902db21c" + "sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,87 @@ }, "boto3": { "hashes": [ - "sha256:15fa6d1acac422d2d34f7811e02acfc7ac222cea24db3f463d5c52f2f87baa52", - "sha256:c974a7fa781c500b7067441f9883ed939cf8c80bcd74c88b11965b336cabb4b6" + "sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79", + "sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0" ], "index": "pypi", - "version": "==1.21.16" + "version": "==1.21.18" }, "botocore": { "hashes": [ - "sha256:0a809efb821d81dc29f2e6c404ed123176b8d2eb43103758f31d89b291af2a8b", - "sha256:dcff7f9b5fea98701d0b520eba99385c538825f10e6d1cab1e7da213293d141e" + "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", + "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" ], "markers": "python_version >= '3.6'", - "version": "==1.24.16" + "version": "==1.24.18" + }, + "brotli": { + "hashes": [ + "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d", + "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8", + "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b", + "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c", + 
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c", + "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70", + "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f", + "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181", + "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130", + "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19", + "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa", + "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429", + "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126", + "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4", + "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0", + "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b", + "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6", + "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438", + "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f", + "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389", + "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6", + "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26", + "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7", + "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14", + "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2", + "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430", + "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296", + "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12", + "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f", + "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d", 
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a", + "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452", + "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c", + "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761", + "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649", + "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b", + "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea", + "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c", + "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a", + "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031", + "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267", + "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5", + "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7", + "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d", + "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c", + "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43", + "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa", + "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17", + "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb", + "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb", + "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b", + "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4", + "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3", + "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7", + "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1", + 
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb", + "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91", + "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b", + "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1", + "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806", + "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3", + "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1" + ], + "markers": "platform_python_implementation == 'CPython'", + "version": "==1.0.9" }, "bs4": { "hashes": [ @@ -226,11 +294,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac", - "sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735" + "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", + "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" ], "markers": "python_version < '3.10'", - "version": "==4.11.2" + "version": "==4.11.3" }, "iniconfig": { "hashes": [ @@ -239,6 +307,13 @@ ], "version": "==1.1.1" }, + "instaloader": { + "hashes": [ + "sha256:9615a12a5a01a8b6c9d99a2a047b21d81b341cfd77656b9261bda30ece0cd562" + ], + "index": "pypi", + "version": "==4.8.4" + }, "jinja2": { "hashes": [ "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", @@ -376,6 +451,14 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, + "mutagen": { + "hashes": [ + "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", + "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" + ], + "markers": "python_version >= '3.5' and python_version < '4'", + "version": "==1.45.1" + }, "numpy": { "hashes": [ "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676", @@ -395,6 +478,7 @@ "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18", 
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62", "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe", + "sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430", "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802", "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa" ], @@ -521,6 +605,39 @@ ], "version": "==0.4.8" }, + "pycryptodomex": { + "hashes": [ + "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", + "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", + "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", + "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", + "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", + "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", + "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", + "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", + "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", + "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", + "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", + "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", + "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", + "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", + "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", + "sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", + "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", + "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", + "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", + 
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", + "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", + "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", + "sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", + "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", + "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", + "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", + "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.14.1" + }, "pyexiftool": { "git": "https://github.com/smarnach/pyexiftool.git", "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" @@ -559,11 +676,11 @@ }, "pytest": { "hashes": [ - "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", - "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", + "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" ], - "markers": "python_version >= '3.6'", - "version": "==7.0.1" + "markers": "python_version >= '3.7'", + "version": "==7.1.0" }, "python-dateutil": { "hashes": [ @@ -681,7 +798,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -853,16 +970,70 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and 
python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, - "youtube-dl": { + "websockets": { "hashes": [ - "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", - "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" + "sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138", + "sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86", + "sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1", + "sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b", + "sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6", + "sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397", + "sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e", + "sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490", + "sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03", + "sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe", + "sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e", + "sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c", + "sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce", + "sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c", + "sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186", + "sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8", + "sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd", + "sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4", + "sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0", + "sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2", + "sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf", + 
"sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098", + "sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045", + "sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8", + "sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71", + "sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9", + "sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3", + "sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d", + "sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f", + "sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa", + "sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f", + "sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a", + "sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42", + "sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3", + "sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd", + "sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325", + "sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39", + "sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124", + "sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5", + "sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b", + "sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c", + "sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea", + "sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa", + "sha256:f09f46b1ff6d09b01c7816c50bd1903cf7d02ebbdb63726132717c2fcda835d5", + "sha256:f14bd10e170abc01682a9f8b28b16e6f20acf6175945ef38db6ffe31b0c72c3f", + "sha256:f5c335dc0e7dc271ef36df3f439868b3c790775f345338c2f61a562f1074187b", + "sha256:f8296b8408ec6853b26771599990721a26403e62b9de7e50ac0a056772ac0b5e", 
+ "sha256:fa35c5d1830d0fb7b810324e9eeab9aa92e8f273f11fdbdc0741dcded6d72b9f" + ], + "markers": "python_version >= '3.7'", + "version": "==10.2" + }, + "yt-dlp": { + "hashes": [ + "sha256:05179f0f2c34f06910003bb9f80af68ff798b072ca0f826c0e6704a3fbd5b306", + "sha256:68546578c18e6ce87450b53769d5d5b7f5a23e5209784976db6c7ccbf7954b21" ], "index": "pypi", - "version": "==2021.12.17" + "version": "==2022.3.8.2" }, "zipp": { "hashes": [ @@ -973,11 +1144,11 @@ }, "pytest": { "hashes": [ - "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", - "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", + "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" ], - "markers": "python_version >= '3.6'", - "version": "==7.0.1" + "markers": "python_version >= '3.7'", + "version": "==7.1.0" }, "pytest-cov": { "hashes": [ diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 4f33931..36e6cd5 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -3,8 +3,11 @@ from .base import Scraper, ScraperController from .bitchute import BitchuteScraper from .gab import GabScraper from .gettr import GettrScraper +from .instagram import InstagramScraper from .odysee import OdyseeScraper from .rumble import RumbleScraper from .telegram_snscrape import TelegramSnscrapeScraper from .telegram_telethon import TelegramTelethonScraper -from .twitter import TwitterScraper \ No newline at end of file +from .twitter import TwitterScraper +from .vkontakte import VkontakteScraper +from .youtube import YoutubeScraper \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index f2eae25..d7f69a1 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -8,6 +8,7 @@ import boto3 from loguru import logger import ffmpeg from sqlalchemy.orm import sessionmaker +import yt_dlp from 
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Download the video behind ``url`` with yt-dlp and return it in memory.

    Args:
        url: Page or media URL that yt-dlp can resolve to a single video.
        key: Storage key for the blob. When ``None`` the key is derived
            with ``self.url_to_key``.

    Returns:
        A ``(blob, content_type, key)`` tuple: the raw file bytes, the MIME
        type of the downloaded file and the storage key.

    Raises:
        yt_dlp.utils.DownloadError: Propagated unchanged when yt-dlp cannot
            extract or download the media.
        FileNotFoundError: If yt-dlp reports success but left no file in
            the temporary directory.
    """
    # Function-scope imports: only this method needs them.
    import mimetypes
    from pathlib import Path

    fallback_content_type = 'video/mp4'

    with tempfile.TemporaryDirectory() as temp_dir:
        ydl_opts = {
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "noplaylist": True,
            'quiet': True,
            "verbose": False,
        }

        # The context manager closes the downloader even when extraction
        # fails; a DownloadError simply propagates to the caller.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            meta = ydl.extract_info(url, download=True)

        media_path = Path(temp_dir) / f"{meta['id']}.{meta['ext']}"
        if not media_path.is_file():
            # The reported extension can differ from the file on disk (for
            # example after a merge), so fall back to whatever was written.
            downloaded = sorted(p for p in Path(temp_dir).iterdir() if p.is_file())
            if not downloaded:
                raise FileNotFoundError(f"yt-dlp produced no file for {url}")
            media_path = downloaded[0]

        # Read before the temporary directory is removed.
        blob = media_path.read_bytes()

    # Only trust a guessed *video* type; anything else keeps the previous
    # hard-coded default so existing keys do not change unexpectedly.
    guessed_type = mimetypes.guess_type(media_path.name)[0]
    if guessed_type and guessed_type.startswith('video/'):
        content_type = guessed_type
    else:
        content_type = fallback_content_type

    if key is None:
        key = self.url_to_key(url=url, content_type=content_type)

    return blob, content_type, key
class InstagramScraper(Scraper):
    """Scraper for Instagram profiles, built on the instaloader library.

    Logs in with the credentials found in the ``INSTAGRAM_USERNAME`` and
    ``INSTAGRAM_PASSWORD`` environment variables. Every post yields one
    ``ScraperResult``, followed by one result per comment on that post.
    """
    __version__ = "InstagramScraper 0.0.1"

    @staticmethod
    def _as_utc(value):
        """Return ``value`` as a timezone-aware UTC datetime.

        Naive datetimes are taken to already be in UTC, which is how
        ``post.date_utc`` and ``comment.created_at_utc`` are used here.
        """
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc)

    def get_username_from_url(self, url):
        """Return the profile name in an Instagram URL, or ``None``.

        ``None`` is returned when ``url`` is empty, does not contain
        ``BASE_URL`` or has no path segment after it.
        """
        if not url:
            return None

        _, separator, remainder = url.partition(BASE_URL)
        if not separator:
            return None

        # Drop query string / fragment and keep the first path segment only.
        path = remainder.split('?', 1)[0].split('#', 1)[0]
        username = path.strip('/').split('/')[0]
        return username or None

    def _archive_post_media(self, loader, post, post_url):
        """Download every media file of ``post`` and archive each of them.

        Returns a dict mapping a source identifier to the archived URL. A
        single-file post is keyed by ``post_url``; posts with several files
        get one ``"<post_url>#<filename>"`` key per file so that no entry
        overwrites another.
        """
        archived_urls = {}

        with tempfile.TemporaryDirectory() as temp_dir:
            # ``target`` is deliberately passed as a Path, as in the
            # original call.
            loader.download_post(post=post, target=Path(temp_dir))

            # instaloader also writes caption ``.txt`` files; skip them.
            files = sorted(f for f in os.listdir(temp_dir) if not f.endswith('.txt'))

            for file in files:
                ext = file.split('.')[-1]
                content_type = CONTENT_TYPES.get(ext)
                if content_type is None:
                    logger.warning(f"Skipping {file}: unsupported media extension for post {post_url}")
                    continue

                with open(Path(temp_dir, file), 'rb') as f:
                    blob = f.read()

                key = f'{post.shortcode}__{file}'
                archived_url = self.archive_blob(blob=blob, content_type=content_type, key=key)

                source = post_url if len(files) == 1 else f'{post_url}#{file}'
                archived_urls[source] = archived_url

        return archived_urls

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield results for the posts and comments of ``channel``.

        Args:
            channel: Channel whose ``url`` points at the Instagram profile.
            since: Most recent result already stored. Iteration stops at the
                first post whose date is not newer than ``since.date``.
            archive_media: When true, download and archive each post's media.

        Raises:
            ValueError: If no username can be extracted from ``channel.url``.
            KeyError: If the credential environment variables are missing.
        """
        username = self.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Cannot extract an Instagram username from {channel.url!r}")

        loader = instaloader.Instaloader(
            quiet=True,
            download_comments=False,
            save_metadata=False)

        loader.login(
            user=os.environ['INSTAGRAM_USERNAME'],
            passwd=os.environ['INSTAGRAM_PASSWORD'])

        profile = instaloader.Profile.from_username(
            context=loader.context,
            username=username)

        # Normalise both sides to aware UTC so a naive post date never gets
        # compared against an aware stored date (TypeError).
        since_date = None if since is None else self._as_utc(since.date)

        for post in profile.get_posts():
            post_date = self._as_utc(post.date_utc)

            if since_date is not None and post_date <= since_date:
                break

            post_url = f'{BASE_URL}p/{post.shortcode}/'

            archived_urls = {}
            if archive_media:
                archived_urls = self._archive_post_media(loader, post, post_url)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Instagram",
                channel=channel.id,
                platform_id=post.mediaid,
                date=post_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls)

            for comment in post.get_comments():
                comment_dict = comment._asdict()
                comment_dict['post_url'] = post_url
                comment_dict['is_comment'] = True

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Instagram",
                    channel=channel.id,
                    # Use the comment's own id; reusing the post's media id
                    # made every comment collide with its parent post.
                    platform_id=comment.id,
                    date=self._as_utc(comment.created_at_utc),
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment_dict, default=str),
                    archived_urls={})

    def can_handle(self, channel):
        """Return True for Instagram channels whose URL names a profile."""
        return (
            channel.platform == "Instagram"
            and self.get_username_from_url(channel.url) is not None)
class VkontakteScraper(Scraper):
    """An implementation of a Scraper for Vkontakte, using snscrape library"""
    __version__ = "VkontakteScraper 0.0.1"

    @staticmethod
    def _to_utc_datetime(value):
        """Return ``value`` (a ``datetime`` or plain ``date``) as aware UTC.

        Naive datetimes are taken to be UTC; a plain date becomes midnight
        UTC of that day.
        """
        if isinstance(value, datetime):
            if value.tzinfo is None:
                return value.replace(tzinfo=timezone.utc)
            return value.astimezone(timezone.utc)
        return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)

    def get_username_from_url(self, url):
        """Return the page name that follows ``vk.com/`` in ``url``.

        Raises:
            ValueError: If ``url`` is empty or has no name after ``vk.com/``.
        """
        _, separator, remainder = (url or '').partition('vk.com/')
        username = remainder.split('?', 1)[0].split('#', 1)[0].strip('/')
        if not separator or not username:
            raise ValueError(f"Cannot extract a Vkontakte username from {url!r}")

        return username

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of ``channel``.

        Args:
            channel: Channel whose ``url`` points at the vk.com page.
            since: Most recent result already stored. Iteration stops at the
                first post not newer than ``since.date_archived``, except
                that an old post in first position is skipped, not treated
                as the end.
            archive_media: When true, archive each post's photos and video.
        """
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)

        cutoff = None if since is None else self._to_utc_datetime(since.date_archived)

        for index, post in enumerate(scraper.get_items()):
            post_date = self._to_utc_datetime(post.date)

            if cutoff is not None and post_date <= cutoff:
                # with VKontakteUserScraper, the first post could be an old
                # pinned post; only that position is exempt from stopping.
                if index == 0:
                    continue
                break

            archived_urls = {}

            if archive_media:

                for photo in post.photos or []:
                    # Archive the largest available rendition of each photo.
                    variant = max(
                        photo.variants,
                        key=lambda v: v.width * v.height,
                        default=None)

                    if variant is not None and variant.url is not None:
                        url = variant.url
                        media_blob, content_type, key = self.url_to_blob(url)
                        archived_url = self.archive_blob(media_blob, content_type, key)
                        archived_urls[url] = archived_url

                if post.video and post.video.url:
                    url = post.video.url
                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
                # Must match the platform name checked in ``can_handle``.
                platform="Vkontakte",
                channel=channel.id,
                platform_id=post.url.split('/')[-1],
                # Keep the time of day when snscrape provides one.
                date=post_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True for Vkontakte channels that carry a platform id."""
        return channel.platform == "Vkontakte" and bool(channel.platform_id)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build the storage key for a media URL.

        JPEG photo URLs keep their last two path segments joined by ``_``;
        anything else is treated as a video and gets an ``.mp4`` suffix.
        """
        path = urlparse(url).path
        if path.endswith('.jpg'):
            key = '_'.join(path.split('/')[-2:])
        else:
            ext = '.mp4'
            key = path.split('/')[-1] + ext

        return key
class YoutubeScraper(Scraper):
    """An implementation of a Scraper for Youtube, using yt-dlp"""
    __version__ = "YoutubeScraper 0.0.1"

    @staticmethod
    def _upload_datetime(video):
        """Return the ``upload_date`` (``YYYYMMDD``) of ``video`` as aware UTC."""
        return datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc)

    @classmethod
    def _iter_videos(cls, entries):
        """Yield video info dicts from ``entries``, descending into nested lists.

        Empty entries and entries without an ``upload_date`` are skipped.
        """
        for entry in entries or []:
            if not entry:
                continue
            if entry.get('entries') is not None:
                yield from cls._iter_videos(entry['entries'])
            elif entry.get('upload_date'):
                yield entry

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per video of ``channel``.

        Args:
            channel: Channel whose ``url`` is handed to yt-dlp.
            since: Most recent result already stored; only videos whose
                upload date is strictly later than ``since.date`` are yielded.
            archive_media: When true, yt-dlp downloads the videos and each
                file is archived.

        Raises:
            yt_dlp.utils.DownloadError: Propagated unchanged from yt-dlp.
        """
        content_type = 'video/mp4'

        if since is None:
            since_date = datetime.min.replace(tzinfo=timezone.utc)
            start_date = None
        else:
            # Compare aware-to-aware: upload dates are parsed as UTC below.
            since_date = since.date
            if since_date.tzinfo is None:
                since_date = since_date.replace(tzinfo=timezone.utc)
            start_date = since.date.strftime('%Y%m%d')

        with tempfile.TemporaryDirectory() as temp_dir:

            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "daterange": yt_dlp.utils.DateRange(start=start_date)}

            # The context manager closes the downloader on error as well.
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                meta = ydl.extract_info(
                    channel.url,
                    download=archive_media)

            # A URL that resolves to a single video has no 'entries' list.
            entries = meta.get('entries')
            if entries is None:
                entries = [meta]

            # NOTE(review): upload_date has day granularity, so videos
            # uploaded later on the same day as ``since`` are not picked up.
            for video in self._iter_videos(entries):
                upload_date = self._upload_datetime(video)
                if not since_date < upload_date:
                    continue

                archived_urls = {}
                video_id = video["id"]

                if archive_media:
                    key = f"{video_id}.{video['ext']}"

                    with open(f"{temp_dir}/{key}", "rb") as f:
                        media_blob = f.read()

                    # Archive exactly once per video.
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[video['webpage_url']] = archived_url

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Youtube",
                    channel=channel.id,
                    platform_id=video_id,
                    date=upload_date,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(video, default=str),
                    archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True for Youtube channels that have a URL."""
        return channel.platform == "Youtube" and bool(channel.url)
self.session.configure(bind=engine) - @logger.catch + @logger.catch(reraise = True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): """Transforms raw ScraperResults objects into TransformedResult objects and Media objects. Then, adds them to the database. @@ -149,7 +149,7 @@ class ETLController: if handled == False: logger.warning(f"No Transformer could handle {result}") - @logger.catch + @logger.catch(reraise = True) def transform_all_untransformed(self, hydrate: bool = True): """Transform all ScraperResult objects in the database that do not have an equivalent TransformedResult object stored. diff --git a/tests/conftest.py b/tests/conftest.py index 42548e9..7703639 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,8 +52,23 @@ GETTR_CHANNEL_KWARGS = { 'chat': False, 'notes': ''} -ODYSEE_CHANNEL_KWARGS = { +INSTAGRAM_CHANNEL_KWARGS = { 'id': 3, + 'name': 'borland.88 (test)', + 'platform_id': 'borland.88', + 'category': 'test', + 'followers': None, + 'platform': 'Instagram', + 'url': 'https://www.instagram.com/borland.88/', + 'screenname': 'borland.88', + 'country': 'UA', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +ODYSEE_CHANNEL_KWARGS = { + 'id': 4, 'name': "Mak1n' Bacon (test)", 'platform_id': 'Mak1nBacon', 'category': 'test', @@ -68,7 +83,7 @@ ODYSEE_CHANNEL_KWARGS = { 'notes': ''} RUMBLE_CHANNEL_KWARGS = { - 'id': 4, + 'id': 5, 'name': 'we are uploading videos wow products', 'platform_id': 'c-916305', 'category': 'test', @@ -83,7 +98,7 @@ RUMBLE_CHANNEL_KWARGS = { 'notes': ''} TELEGRAM_CHANNEL_KWARGS = { - 'id': 5, + 'id': 6, 'name': 'South West Ohio Proud Boys (test)', 'platform_id': -1001276612436, 'category': 'test', @@ -98,7 +113,7 @@ TELEGRAM_CHANNEL_KWARGS = { 'notes': ''} TWITTER_CHANNEL_KWARGS = { - 'id': 5, + 'id': 7, 'name': 'L Weber (test)', 'platform_id': 1424979017749442595, 'category': 'test', @@ -112,6 +127,36 @@ TWITTER_CHANNEL_KWARGS = { 'chat': False, 'notes': ''} 
+VKONTAKTE_CHANNEL_KWARGS = { + 'id': 8, + 'name': 'Wwg1wgA (test)', + 'platform_id': 'club201278078', + 'category': 'test', + 'followers': None, + 'platform': 'Vkontakte', + 'url': 'https://vk.com/club201278078', + 'screenname': 'Wwg1wgA', + 'country': 'FR', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +YOUTUBE_CHANNEL_KWARGS = { + 'id': 9, + 'name': 'AnEs87 (test)', + 'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA', + 'category': 'test', + 'followers': None, + 'platform': 'Youtube', + 'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA', + 'screenname': 'AnEs87', + 'country': 'SV', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -151,12 +196,15 @@ def channel_kwargs(): """ return { - 'bitchute': BITCHUTE_CHANNEL_KWARGS, - 'gab': GAB_CHANNEL_KWARGS, - 'gettr': GETTR_CHANNEL_KWARGS, - 'odysee': ODYSEE_CHANNEL_KWARGS, - 'rumble': RUMBLE_CHANNEL_KWARGS, - 'telegram': TELEGRAM_CHANNEL_KWARGS, - 'twitter': TWITTER_CHANNEL_KWARGS} + 'bitchute' : BITCHUTE_CHANNEL_KWARGS, + 'gab' : GAB_CHANNEL_KWARGS, + 'gettr' : GETTR_CHANNEL_KWARGS, + 'instagram' : INSTAGRAM_CHANNEL_KWARGS, + 'odysee' : ODYSEE_CHANNEL_KWARGS, + 'rumble' : RUMBLE_CHANNEL_KWARGS, + 'telegram' : TELEGRAM_CHANNEL_KWARGS, + 'twitter' : TWITTER_CHANNEL_KWARGS, + 'vkontakte' : VKONTAKTE_CHANNEL_KWARGS, + 'youtube' : YOUTUBE_CHANNEL_KWARGS} #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py new file mode 100644 index 0000000..0beb546 --- /dev/null +++ b/tests/scraper/instagram.py @@ -0,0 +1,16 @@ +from cisticola.base import Channel +from cisticola.scraper import InstagramScraper + +def test_scrape_instagram_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['instagram'])] + controller.register_scraper(scraper = InstagramScraper()) + 
controller.scrape_channels(channels = channels, archive_media = False) + +def test_scrape_instagram_channel(controller, channel_kwargs): + + controller.reset_db() + + channels = [Channel(**channel_kwargs['instagram'])] + controller.register_scraper(scraper = InstagramScraper()) + controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py new file mode 100644 index 0000000..ef7cfa1 --- /dev/null +++ b/tests/scraper/vkontakte.py @@ -0,0 +1,16 @@ +from cisticola.base import Channel +from cisticola.scraper import VkontakteScraper + +def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['vkontakte'])] + controller.register_scraper(scraper = VkontakteScraper()) + controller.scrape_channels(channels = channels, archive_media = False) + +def test_scrape_vkontakte_channel(controller, channel_kwargs): + + controller.reset_db() + + channels = [Channel(**channel_kwargs['vkontakte'])] + controller.register_scraper(scraper = VkontakteScraper()) + controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py new file mode 100644 index 0000000..9d14760 --- /dev/null +++ b/tests/scraper/youtube.py @@ -0,0 +1,16 @@ +from cisticola.base import Channel +from cisticola.scraper import YoutubeScraper + +def test_scrape_youtube_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['youtube'])] + controller.register_scraper(scraper = YoutubeScraper()) + controller.scrape_channels(channels = channels, archive_media = False) + +def test_scrape_youtube_channel(controller, channel_kwargs): + + controller.reset_db() + + channels = [Channel(**channel_kwargs['youtube'])] + controller.register_scraper(scraper = YoutubeScraper()) + controller.scrape_channels(channels = channels, archive_media = True)