From 750f0cc8879ce08dfcb40ed65515569a2e4a80ad Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 14 Mar 2022 10:28:10 -0500 Subject: [PATCH] added scraper for Instagram --- Pipfile | 1 + Pipfile.lock | 219 +++++++++++++++++++++++++++++---- cisticola/scraper/__init__.py | 1 + cisticola/scraper/instagram.py | 102 +++++++++++++++ tests/conftest.py | 28 ++++- tests/scraper/instagram.py | 16 +++ 6 files changed, 337 insertions(+), 30 deletions(-) create mode 100644 cisticola/scraper/instagram.py create mode 100644 tests/scraper/instagram.py diff --git a/Pipfile b/Pipfile index 0337328..5457fcc 100644 --- a/Pipfile +++ b/Pipfile @@ -20,6 +20,7 @@ yt-dlp = "*" telethon = "*" pytesseract = "*" pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} +instaloader = "*" [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 5a75176..50622b1 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "afacc6dd45c110f235861c54db45f5546fb0095f4e68a1084e85fd0e902db21c" + "sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,87 @@ }, "boto3": { "hashes": [ - "sha256:15fa6d1acac422d2d34f7811e02acfc7ac222cea24db3f463d5c52f2f87baa52", - "sha256:c974a7fa781c500b7067441f9883ed939cf8c80bcd74c88b11965b336cabb4b6" + "sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79", + "sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0" ], "index": "pypi", - "version": "==1.21.16" + "version": "==1.21.18" }, "botocore": { "hashes": [ - "sha256:0a809efb821d81dc29f2e6c404ed123176b8d2eb43103758f31d89b291af2a8b", - "sha256:dcff7f9b5fea98701d0b520eba99385c538825f10e6d1cab1e7da213293d141e" + "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", + "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" ], "markers": "python_version >= '3.6'", - "version": "==1.24.16" + "version": 
"==1.24.18" + }, + "brotli": { + "hashes": [ + "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d", + "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8", + "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b", + "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c", + "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c", + "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70", + "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f", + "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181", + "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130", + "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19", + "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa", + "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429", + "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126", + "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4", + "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0", + "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b", + "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6", + "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438", + "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f", + "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389", + "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6", + "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26", + "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7", + "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14", + "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2", + 
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430", + "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296", + "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12", + "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f", + "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d", + "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a", + "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452", + "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c", + "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761", + "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649", + "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b", + "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea", + "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c", + "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a", + "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031", + "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267", + "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5", + "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7", + "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d", + "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c", + "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43", + "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa", + "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17", + "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb", + "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb", + "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b", 
+ "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4", + "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3", + "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7", + "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1", + "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb", + "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91", + "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b", + "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1", + "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806", + "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3", + "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1" + ], + "markers": "platform_python_implementation == 'CPython'", + "version": "==1.0.9" }, "bs4": { "hashes": [ @@ -226,11 +294,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac", - "sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735" + "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", + "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" ], "markers": "python_version < '3.10'", - "version": "==4.11.2" + "version": "==4.11.3" }, "iniconfig": { "hashes": [ @@ -239,6 +307,13 @@ ], "version": "==1.1.1" }, + "instaloader": { + "hashes": [ + "sha256:9615a12a5a01a8b6c9d99a2a047b21d81b341cfd77656b9261bda30ece0cd562" + ], + "index": "pypi", + "version": "==4.8.4" + }, "jinja2": { "hashes": [ "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", @@ -376,6 +451,14 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, + "mutagen": { + "hashes": [ + "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", + 
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" + ], + "markers": "python_version >= '3.5' and python_version < '4'", + "version": "==1.45.1" + }, "numpy": { "hashes": [ "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676", @@ -395,6 +478,7 @@ "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18", "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62", "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe", + "sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430", "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802", "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa" ], @@ -521,6 +605,39 @@ ], "version": "==0.4.8" }, + "pycryptodomex": { + "hashes": [ + "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", + "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", + "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", + "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", + "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", + "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", + "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", + "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", + "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", + "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", + "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", + "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", + "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", + "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", + "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", + 
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", + "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", + "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", + "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", + "sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", + "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", + "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", + "sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", + "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", + "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", + "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", + "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.14.1" + }, "pyexiftool": { "git": "https://github.com/smarnach/pyexiftool.git", "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" @@ -559,11 +676,11 @@ }, "pytest": { "hashes": [ - "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", - "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", + "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" ], - "markers": "python_version >= '3.6'", - "version": "==7.0.1" + "markers": "python_version >= '3.7'", + "version": "==7.1.0" }, "python-dateutil": { "hashes": [ @@ -681,7 +798,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4.0'", + "markers": "python_version >= '3.6' and python_version < '4'", 
"version": "==4.8" }, "s3transfer": { @@ -853,16 +970,70 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, - "youtube-dl": { + "websockets": { "hashes": [ - "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", - "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" + "sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138", + "sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86", + "sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1", + "sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b", + "sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6", + "sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397", + "sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e", + "sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490", + "sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03", + "sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe", + "sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e", + "sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c", + "sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce", + "sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c", + "sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186", + "sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8", + "sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd", + 
"sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4", + "sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0", + "sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2", + "sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf", + "sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098", + "sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045", + "sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8", + "sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71", + "sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9", + "sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3", + "sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d", + "sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f", + "sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa", + "sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f", + "sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a", + "sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42", + "sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3", + "sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd", + "sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325", + "sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39", + "sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124", + "sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5", + "sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b", + "sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c", + "sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea", + "sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa", 
"""Scraper for Instagram profiles, built on the ``instaloader`` library."""

from typing import Dict, Generator, Optional
from datetime import datetime, timezone
import os
import json
import tempfile
from pathlib import Path

from loguru import logger
import instaloader

from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper

BASE_URL = 'https://www.instagram.com/'

# Extension of a downloaded media file -> MIME type passed to `archive_blob`.
CONTENT_TYPES = {
    'jpg': 'image/jpeg',
    'mp4': 'video/mp4',
}


class InstagramScraper(Scraper):
    """Scraper that yields the posts, and the comments on them, of an Instagram profile."""

    __version__ = "InstagramScraper 0.0.1"

    def get_username_from_url(self, url) -> Optional[str]:
        """Extract the profile name from an Instagram profile URL.

        Parameters
        ----------
        url:
            Channel URL, expected to look like ``BASE_URL + '<username>/'``.

        Returns
        -------
        The text following ``BASE_URL`` with surrounding slashes stripped, or
        ``None`` when `url` is empty, does not contain ``BASE_URL``, or has
        nothing after it. (Previously a non-matching URL raised ``IndexError``,
        which made the ``is not None`` test in `can_handle` unreachable.)
        """
        if not url:
            return None

        _, base, remainder = url.partition(BASE_URL)
        if not base:
            return None

        return remainder.strip('/') or None

    def _archive_post_media(self, loader, post, post_url: str) -> Dict[str, str]:
        """Download the media files of `post` and archive each one.

        Returns a dict with one entry per archived file, keyed by
        ``'<post_url>#<file name>'`` and holding the value returned by
        `archive_blob`. The key includes the file name because a post can
        produce several files; keying every file by `post_url` alone let each
        file overwrite the previous one.
        """
        archived_urls = {}

        # The directory (and every downloaded file) is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            loader.download_post(post=post, target=Path(temp_dir))

            # Sorted so that archiving order is deterministic.
            for media_path in sorted(Path(temp_dir).iterdir()):
                file = media_path.name

                # Text files written next to the media are not archived.
                if file.endswith('.txt'):
                    continue

                ext = media_path.suffix.lstrip('.').lower()
                content_type = CONTENT_TYPES.get(ext)
                if content_type is None:
                    # An unexpected file type should not abort the whole scrape.
                    logger.warning(f"Skipping file {file} of {post_url}: no content type known for extension '{ext}'")
                    continue

                key = f'{post.shortcode}__{file}'
                blob = media_path.read_bytes()

                archived_urls[f'{post_url}#{file}'] = self.archive_blob(
                    blob=blob, content_type=content_type, key=key)

        return archived_urls

    def get_posts(self, channel: Channel, since: Optional[ScraperResult] = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield a `ScraperResult` for every post of `channel` and for every comment on it.

        Logs in with the credentials in the ``INSTAGRAM_USERNAME`` and
        ``INSTAGRAM_PASSWORD`` environment variables (``KeyError`` if unset).

        Parameters
        ----------
        channel:
            Channel whose ``url`` points at the Instagram profile to scrape.
        since:
            If given, iteration stops at the first post whose ``date_utc`` is
            not later than ``since.date``.
        archive_media:
            When true, each post's media files are downloaded and archived and
            the resulting URLs are attached to the post's result.

        Raises
        ------
        ValueError
            If no username can be extracted from ``channel.url``.
        """
        username = self.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Cannot extract an Instagram username from URL {channel.url!r}")

        loader = instaloader.Instaloader(
            quiet=True,
            download_comments=False,
            save_metadata=False)

        loader.login(
            user=os.environ['INSTAGRAM_USERNAME'],
            passwd=os.environ['INSTAGRAM_PASSWORD'])

        profile = instaloader.Profile.from_username(
            context=loader.context,
            username=username)

        for post in profile.get_posts():

            # NOTE(review): `break` assumes posts arrive newest first, and the
            # comparison assumes `since.date` and `post.date_utc` are both naive
            # or both timezone-aware -- confirm against the data model.
            if since is not None and post.date_utc <= since.date:
                break

            post_url = f'{BASE_URL}p/{post.shortcode}/'

            if archive_media:
                archived_urls = self._archive_post_media(loader, post, post_url)
            else:
                archived_urls = {}

            yield ScraperResult(
                scraper=self.__version__,
                platform="Instagram",
                channel=channel.id,
                platform_id=post.mediaid,
                date=post.date_utc,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls)

            for comment in post.get_comments():

                # Tag the raw data so a comment can be told apart from a post
                # and traced back to the post it belongs to.
                comment_dict = comment._asdict()
                comment_dict['post_url'] = post_url
                comment_dict['is_comment'] = True

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Instagram",
                    channel=channel.id,
                    # The comment's own ID: reusing the post's media ID gave
                    # every comment the same platform_id as its parent post.
                    platform_id=comment.id,
                    date=comment.created_at_utc,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment_dict, default=str),
                    archived_urls={})

    def can_handle(self, channel) -> bool:
        """Return True if `channel` is an Instagram channel whose URL yields a username."""
        return (
            channel.platform == "Instagram"
            and self.get_username_from_url(channel.url) is not None)
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper


def _scrape_instagram_test_channel(controller, channel_kwargs, archive_media):
    """Register an InstagramScraper on `controller` and scrape the Instagram test channel."""
    instagram_channel = Channel(**channel_kwargs['instagram'])
    controller.register_scraper(scraper=InstagramScraper())
    controller.scrape_channels(
        channels=[instagram_channel],
        archive_media=archive_media)


def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
    """Smoke test: scraping the test channel without archiving media does not raise."""
    _scrape_instagram_test_channel(controller, channel_kwargs, archive_media=False)


def test_scrape_instagram_channel(controller, channel_kwargs):
    """Smoke test: after `controller.reset_db()`, scraping with media archiving does not raise."""
    controller.reset_db()
    _scrape_instagram_test_channel(controller, channel_kwargs, archive_media=True)