merged scraper into main and fixed minor merge conflict

This commit is contained in:
Tristan Lee
2022-03-15 09:12:12 -05:00
13 changed files with 612 additions and 78 deletions

View File

@@ -16,10 +16,11 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
yt-dlp = "*"
telethon = "*"
pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
instaloader = "*"
[dev-packages]
pytest = "*"

219
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "afacc6dd45c110f235861c54db45f5546fb0095f4e68a1084e85fd0e902db21c"
"sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5"
},
"pipfile-spec": 6,
"requires": {
@@ -49,19 +49,87 @@
},
"boto3": {
"hashes": [
"sha256:15fa6d1acac422d2d34f7811e02acfc7ac222cea24db3f463d5c52f2f87baa52",
"sha256:c974a7fa781c500b7067441f9883ed939cf8c80bcd74c88b11965b336cabb4b6"
"sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79",
"sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0"
],
"index": "pypi",
"version": "==1.21.16"
"version": "==1.21.18"
},
"botocore": {
"hashes": [
"sha256:0a809efb821d81dc29f2e6c404ed123176b8d2eb43103758f31d89b291af2a8b",
"sha256:dcff7f9b5fea98701d0b520eba99385c538825f10e6d1cab1e7da213293d141e"
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.16"
"version": "==1.24.18"
},
"brotli": {
"hashes": [
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
"sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
"sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
"sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
"sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"markers": "platform_python_implementation == 'CPython'",
"version": "==1.0.9"
},
"bs4": {
"hashes": [
@@ -226,11 +294,11 @@
},
"importlib-metadata": {
"hashes": [
"sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac",
"sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735"
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.2"
"version": "==4.11.3"
},
"iniconfig": {
"hashes": [
@@ -239,6 +307,13 @@
],
"version": "==1.1.1"
},
"instaloader": {
"hashes": [
"sha256:9615a12a5a01a8b6c9d99a2a047b21d81b341cfd77656b9261bda30ece0cd562"
],
"index": "pypi",
"version": "==4.8.4"
},
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
@@ -376,6 +451,14 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"mutagen": {
"hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"numpy": {
"hashes": [
"sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676",
@@ -395,6 +478,7 @@
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18",
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62",
"sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe",
"sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430",
"sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802",
"sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"
],
@@ -521,6 +605,39 @@
],
"version": "==0.4.8"
},
"pycryptodomex": {
"hashes": [
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
"sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659",
"sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45",
"sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2",
"sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489",
"sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2",
"sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750",
"sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151",
"sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2",
"sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045",
"sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859",
"sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89",
"sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e",
"sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461",
"sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6",
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f",
"sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb",
"sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662",
"sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b",
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe",
"sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8",
"sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125",
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c",
"sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48",
"sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff",
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"pyexiftool": {
"git": "https://github.com/smarnach/pyexiftool.git",
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
@@ -559,11 +676,11 @@
},
"pytest": {
"hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
"sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
"sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47"
],
"markers": "python_version >= '3.6'",
"version": "==7.0.1"
"markers": "python_version >= '3.7'",
"version": "==7.1.0"
},
"python-dateutil": {
"hashes": [
@@ -681,7 +798,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4.0'",
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8"
},
"s3transfer": {
@@ -853,16 +970,70 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"youtube-dl": {
"websockets": {
"hashes": [
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
"sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138",
"sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86",
"sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1",
"sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b",
"sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6",
"sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397",
"sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e",
"sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490",
"sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03",
"sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe",
"sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e",
"sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c",
"sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce",
"sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c",
"sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186",
"sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8",
"sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd",
"sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4",
"sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0",
"sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2",
"sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf",
"sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098",
"sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045",
"sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8",
"sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71",
"sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9",
"sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3",
"sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d",
"sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f",
"sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa",
"sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f",
"sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a",
"sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42",
"sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3",
"sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd",
"sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325",
"sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39",
"sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124",
"sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5",
"sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b",
"sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c",
"sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea",
"sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa",
"sha256:f09f46b1ff6d09b01c7816c50bd1903cf7d02ebbdb63726132717c2fcda835d5",
"sha256:f14bd10e170abc01682a9f8b28b16e6f20acf6175945ef38db6ffe31b0c72c3f",
"sha256:f5c335dc0e7dc271ef36df3f439868b3c790775f345338c2f61a562f1074187b",
"sha256:f8296b8408ec6853b26771599990721a26403e62b9de7e50ac0a056772ac0b5e",
"sha256:fa35c5d1830d0fb7b810324e9eeab9aa92e8f273f11fdbdc0741dcded6d72b9f"
],
"markers": "python_version >= '3.7'",
"version": "==10.2"
},
"yt-dlp": {
"hashes": [
"sha256:05179f0f2c34f06910003bb9f80af68ff798b072ca0f826c0e6704a3fbd5b306",
"sha256:68546578c18e6ce87450b53769d5d5b7f5a23e5209784976db6c7ccbf7954b21"
],
"index": "pypi",
"version": "==2021.12.17"
"version": "==2022.3.8.2"
},
"zipp": {
"hashes": [
@@ -973,11 +1144,11 @@
},
"pytest": {
"hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
"sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
"sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47"
],
"markers": "python_version >= '3.6'",
"version": "==7.0.1"
"markers": "python_version >= '3.7'",
"version": "==7.1.0"
},
"pytest-cov": {
"hashes": [

View File

@@ -3,8 +3,11 @@ from .base import Scraper, ScraperController
from .bitchute import BitchuteScraper
from .gab import GabScraper
from .gettr import GettrScraper
from .instagram import InstagramScraper
from .odysee import OdyseeScraper
from .rumble import RumbleScraper
from .telegram_snscrape import TelegramSnscrapeScraper
from .telegram_telethon import TelegramTelethonScraper
from .twitter import TwitterScraper
from .twitter import TwitterScraper
from .vkontakte import VkontakteScraper
from .youtube import YoutubeScraper

View File

@@ -8,6 +8,7 @@ import boto3
from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
import yt_dlp
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
@@ -69,6 +70,38 @@ class Scraper:
return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir:
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"noplaylist": True,
'quiet': True,
"verbose": False,}
ydl = yt_dlp.YoutubeDL(ydl_opts)
try:
meta = ydl.extract_info(
url,
download=True,)
except yt_dlp.utils.DownloadError as e:
raise e
else:
video_id = meta["id"]
video_ext = meta["ext"]
with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
blob = f.read()
if key is None:
key = self.url_to_key(url = url, content_type = content_type)
return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -101,7 +134,7 @@ class ScraperController:
def register_scrapers(self, scraper: List[Scraper]):
self.scrapers.extend(scraper)
@logger.catch
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
if self.session is None:
logger.error("No DB session")

View File

@@ -0,0 +1,102 @@
from typing import Generator
from datetime import datetime, timezone
import os
import json
import tempfile
from pathlib import Path
from loguru import logger
import instaloader
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/'
CONTENT_TYPES = {
'jpg' : 'image/jpeg',
'mp4' : 'video/mp4'}
class InstagramScraper(Scraper):
__version__ = "InstagramScraper 0.0.1"
def get_username_from_url(self, url):
username = url.split(BASE_URL)[1].strip('/')
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
loader = instaloader.Instaloader(
quiet = True,
download_comments = False,
save_metadata = False)
loader.login(
user = os.environ['INSTAGRAM_USERNAME'],
passwd = os.environ['INSTAGRAM_PASSWORD'])
profile = instaloader.Profile.from_username(
context = loader.context,
username = username)
for post in profile.get_posts():
if since is not None and post.date_utc <= since.date:
break
post_url = f'{BASE_URL}p/{post.shortcode}/'
archived_urls = {}
if archive_media:
with tempfile.TemporaryDirectory() as temp_dir:
loader.download_post(post = post, target = Path(temp_dir))
files = os.listdir(temp_dir)
files = [f for f in files if not f.endswith('.txt')]
for file in files:
ext = file.split('.')[-1]
content_type = CONTENT_TYPES[ext]
filename = Path(temp_dir, file)
key = f'{post.shortcode}__{file}'
with open(filename, 'rb') as f:
blob = f.read()
archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
archived_urls[post_url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Instagram",
channel=channel.id,
platform_id=post.mediaid,
date=post.date_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls)
for comment in post.get_comments():
comment_dict = comment._asdict()
comment_dict['post_url'] = post_url
comment_dict['is_comment'] = True
yield ScraperResult(
scraper=self.__version__,
platform="Instagram",
channel=channel.id,
platform_id=post.mediaid,
date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str),
archived_urls={})
def can_handle(self, channel):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
return True

View File

@@ -1,12 +1,9 @@
from datetime import datetime, timezone
import json
from typing import Generator, Tuple
import tempfile
from typing import Generator
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import youtube_dl
from cisticola.base import Channel, ScraperResult
from cisticola.scraper import Scraper, make_request
@@ -37,7 +34,7 @@ class RumbleScraper(Scraper):
url = post['media_url']
media_blob, content_type, key = self.url_to_blob(url)
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
@@ -51,43 +48,15 @@ class RumbleScraper(Scraper):
raw_data=json.dumps(post),
archived_urls=archived_urls)
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]
key = urlparse(url).path.split('/')[-2] + ext
return key
def can_handle(self, channel):
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
return True
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1]
with tempfile.TemporaryDirectory() as temp_dir:
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"noplaylist": True,
'quiet': True,
"verbose": False,}
ydl = youtube_dl.YoutubeDL(ydl_opts)
try:
meta = ydl.extract_info(
url,
download=True,)
except youtube_dl.utils.DownloadError as e:
raise e
else:
video_id = meta["id"]
video_ext = meta["ext"]
with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
blob = f.read()
if key is None:
key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_media_url(url):

View File

@@ -0,0 +1,80 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
"""An implementation of a Scraper for Vkontakte, using snscrape library"""
__version__ = "VkontakteScraper 0.0.1"
def get_username_from_url(self, url):
username = url.split('https://vk.com/')[1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
scraper = VKontakteUserScraper(username)
first = True
for post in scraper.get_items():
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
# with VKontakteUserScraper, the first tweet could be an old pinned tweet
if first:
first = False
continue
else:
break
archived_urls = {}
if archive_media:
if post.photos:
for photo in post.photos:
variant = max(
[v for v in photo.variants], key=lambda v: v.width * v.height)
url = variant.url
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
if post.video:
url = post.video.url
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Vkontatke",
channel=channel.id,
platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Vkontakte" and channel.platform_id:
return True
def url_to_key(self, url: str, content_type: str) -> str:
path = urlparse(url).path
if path.endswith('.jpg'):
key = '_'.join(path.split('/')[-2:])
else:
ext = '.mp4'
key = path.split('/')[-1] + ext
return key

View File

@@ -0,0 +1,79 @@
from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
import yt_dlp
from cisticola.base import Channel, ScraperResult
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
"""An implementation of a Scraper for Youtube, using youtube-dl"""
__version__ = "YoutubeScraper 0.0.1"
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
content_type = 'video/mp4'
if since is None:
since_date = datetime.min
start_date = None
else:
since_date = since.date
start_date = since.date.strftime('%Y%m%d')
with tempfile.TemporaryDirectory() as temp_dir:
daterange = yt_dlp.utils.DateRange(start = start_date)
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"daterange" : daterange}
ydl = yt_dlp.YoutubeDL(ydl_opts)
try:
meta = ydl.extract_info(
channel.url,
download=archive_media)
except yt_dlp.utils.DownloadError as e:
raise e
else:
videos = meta['entries']
valid_videos = [video for video in videos if since_date < datetime.strptime(video['upload_date'], '%Y%m%d')]
for video in valid_videos:
archived_urls = {}
video_id = video["id"]
video_ext = video["ext"]
if archive_media:
key = f"{video_id}.{video_ext}"
with open(f"{temp_dir}/{key}", "rb") as f:
media_blob = f.read()
archived_url = self.archive_blob(media_blob, content_type, key)
url = video['webpage_url']
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Youtube",
channel=channel.id,
platform_id=video_id,
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Youtube" and channel.url:
return True

View File

@@ -101,7 +101,7 @@ class ETLController:
self.session = sessionmaker()
self.session.configure(bind=engine)
@logger.catch
@logger.catch(reraise = True)
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
"""Transforms raw ScraperResults objects into TransformedResult objects and
Media objects. Then, adds them to the database.
@@ -149,7 +149,7 @@ class ETLController:
if handled == False:
logger.warning(f"No Transformer could handle {result}")
@logger.catch
@logger.catch(reraise = True)
def transform_all_untransformed(self, hydrate: bool = True):
"""Transform all ScraperResult objects in the database that do not have an
equivalent TransformedResult object stored.

View File

@@ -52,8 +52,23 @@ GETTR_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
ODYSEE_CHANNEL_KWARGS = {
INSTAGRAM_CHANNEL_KWARGS = {
'id': 3,
'name': 'borland.88 (test)',
'platform_id': 'borland.88',
'category': 'test',
'followers': None,
'platform': 'Instagram',
'url': 'https://www.instagram.com/borland.88/',
'screenname': 'borland.88',
'country': 'UA',
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
ODYSEE_CHANNEL_KWARGS = {
'id': 4,
'name': "Mak1n' Bacon (test)",
'platform_id': 'Mak1nBacon',
'category': 'test',
@@ -68,7 +83,7 @@ ODYSEE_CHANNEL_KWARGS = {
'notes': ''}
RUMBLE_CHANNEL_KWARGS = {
'id': 4,
'id': 5,
'name': 'we are uploading videos wow products',
'platform_id': 'c-916305',
'category': 'test',
@@ -83,7 +98,7 @@ RUMBLE_CHANNEL_KWARGS = {
'notes': ''}
TELEGRAM_CHANNEL_KWARGS = {
'id': 5,
'id': 6,
'name': 'South West Ohio Proud Boys (test)',
'platform_id': -1001276612436,
'category': 'test',
@@ -98,7 +113,7 @@ TELEGRAM_CHANNEL_KWARGS = {
'notes': ''}
TWITTER_CHANNEL_KWARGS = {
'id': 5,
'id': 7,
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
'category': 'test',
@@ -112,6 +127,36 @@ TWITTER_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
VKONTAKTE_CHANNEL_KWARGS = {
'id': 8,
'name': 'Wwg1wgA (test)',
'platform_id': 'club201278078',
'category': 'test',
'followers': None,
'platform': 'Vkontakte',
'url': 'https://vk.com/club201278078',
'screenname': 'Wwg1wgA',
'country': 'FR',
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
YOUTUBE_CHANNEL_KWARGS = {
'id': 9,
'name': 'AnEs87 (test)',
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
'category': 'test',
'followers': None,
'platform': 'Youtube',
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
'screenname': 'AnEs87',
'country': 'SV',
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -151,12 +196,15 @@ def channel_kwargs():
"""
return {
'bitchute': BITCHUTE_CHANNEL_KWARGS,
'gab': GAB_CHANNEL_KWARGS,
'gettr': GETTR_CHANNEL_KWARGS,
'odysee': ODYSEE_CHANNEL_KWARGS,
'rumble': RUMBLE_CHANNEL_KWARGS,
'telegram': TELEGRAM_CHANNEL_KWARGS,
'twitter': TWITTER_CHANNEL_KWARGS}
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
'youtube' : YOUTUBE_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -0,0 +1,16 @@
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['instagram'])]
controller.register_scraper(scraper = InstagramScraper())
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_instagram_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['instagram'])]
controller.register_scraper(scraper = InstagramScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -0,0 +1,16 @@
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_vkontakte_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = True)

16
tests/scraper/youtube.py Normal file
View File

@@ -0,0 +1,16 @@
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['youtube'])]
controller.register_scraper(scraper = YoutubeScraper())
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_youtube_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['youtube'])]
controller.register_scraper(scraper = YoutubeScraper())
controller.scrape_channels(channels = channels, archive_media = True)