mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
added capability of running scraper without archiving media, and implemented prototype Telethon scraper for Telegram
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,6 +8,7 @@ docs/source/_*
|
||||
*.ipynb
|
||||
*.db
|
||||
.env
|
||||
*.session
|
||||
|
||||
# Unit test / coverage reports
|
||||
reports
|
||||
|
||||
1
Pipfile
1
Pipfile
@@ -17,6 +17,7 @@ ffmpeg-python = "*"
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
youtube-dl = "*"
|
||||
telethon = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
172
Pipfile.lock
generated
172
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "ea2a1f1dff68fa0bd30dab06553e913f467c3b1399388b97f0ed913ab74c6e85"
|
||||
"sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -49,19 +49,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:75709628320cea8ce137975dc33b75213c2e4f6e7cd09e55290de7245e2c79e2",
|
||||
"sha256:c92ec20a670721b5a1bc013b305a84db2b7f9c716653b3056ce7e2fbd2a180ef"
|
||||
"sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a",
|
||||
"sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.12"
|
||||
"version": "==1.21.15"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:0174999a04b0a2e42457106093ace9b36fa94772a442d9bcf60750263d1d073e",
|
||||
"sha256:0cd7395311a3fef4aad8df8f511b4f7d221c24ae30934bd5c03458b0fc096d0c"
|
||||
"sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b",
|
||||
"sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.12"
|
||||
"version": "==1.24.15"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
@@ -378,28 +378,28 @@
|
||||
},
|
||||
"numpy": {
|
||||
"hashes": [
|
||||
"sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f",
|
||||
"sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf",
|
||||
"sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89",
|
||||
"sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd",
|
||||
"sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b",
|
||||
"sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a",
|
||||
"sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b",
|
||||
"sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e",
|
||||
"sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956",
|
||||
"sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2",
|
||||
"sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f",
|
||||
"sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a",
|
||||
"sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896",
|
||||
"sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c",
|
||||
"sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6",
|
||||
"sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f",
|
||||
"sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7",
|
||||
"sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082",
|
||||
"sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677"
|
||||
"sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676",
|
||||
"sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4",
|
||||
"sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce",
|
||||
"sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123",
|
||||
"sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1",
|
||||
"sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e",
|
||||
"sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5",
|
||||
"sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d",
|
||||
"sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a",
|
||||
"sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab",
|
||||
"sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75",
|
||||
"sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168",
|
||||
"sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4",
|
||||
"sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f",
|
||||
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18",
|
||||
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62",
|
||||
"sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe",
|
||||
"sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802",
|
||||
"sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"
|
||||
],
|
||||
"markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'",
|
||||
"version": "==1.22.2"
|
||||
"version": "==1.22.3"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
@@ -446,7 +446,7 @@
|
||||
},
|
||||
"polyphemus": {
|
||||
"git": "https://github.com/bellingcat/polyphemus.git",
|
||||
"ref": "8506fd43770661cdcf92c5cac2356cba74778834"
|
||||
"ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
@@ -456,6 +456,30 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"pyaes": {
|
||||
"hashes": [
|
||||
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
|
||||
],
|
||||
"version": "==1.6.1"
|
||||
},
|
||||
"pyasn1": {
|
||||
"hashes": [
|
||||
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
|
||||
"sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576",
|
||||
"sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf",
|
||||
"sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7",
|
||||
"sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d",
|
||||
"sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00",
|
||||
"sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8",
|
||||
"sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86",
|
||||
"sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12",
|
||||
"sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776",
|
||||
"sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba",
|
||||
"sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2",
|
||||
"sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"
|
||||
],
|
||||
"version": "==0.4.8"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
||||
@@ -592,9 +616,6 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -602,6 +623,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
},
|
||||
"rsa": {
|
||||
"hashes": [
|
||||
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
|
||||
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '4.0'",
|
||||
"version": "==4.8"
|
||||
},
|
||||
"s3transfer": {
|
||||
"hashes": [
|
||||
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
|
||||
@@ -695,45 +724,52 @@
|
||||
},
|
||||
"sqlalchemy": {
|
||||
"hashes": [
|
||||
"sha256:05fa14f279d43df68964ad066f653193187909950aa0163320b728edfc400167",
|
||||
"sha256:0ddc5e5ccc0160e7ad190e5c61eb57560f38559e22586955f205e537cda26034",
|
||||
"sha256:15a03261aa1e68f208e71ae3cd845b00063d242cbf8c87348a0c2c0fc6e1f2ac",
|
||||
"sha256:289465162b1fa1e7a982f8abe59d26a8331211cad4942e8031d2b7db1f75e649",
|
||||
"sha256:2e216c13ecc7fcdcbb86bb3225425b3ed338e43a8810c7089ddb472676124b9b",
|
||||
"sha256:2fd4d3ca64c41dae31228b80556ab55b6489275fb204827f6560b65f95692cf3",
|
||||
"sha256:330eb45395874cc7787214fdd4489e2afb931bc49e0a7a8f9cd56d6e9c5b1639",
|
||||
"sha256:3c7ed6c69debaf6198fadb1c16ae1253a29a7670bbf0646f92582eb465a0b999",
|
||||
"sha256:4ad31cec8b49fd718470328ad9711f4dc703507d434fd45461096da0a7135ee0",
|
||||
"sha256:57205844f246bab9b666a32f59b046add8995c665d9ecb2b7b837b087df90639",
|
||||
"sha256:582b59d1e5780a447aada22b461e50b404a9dc05768da1d87368ad8190468418",
|
||||
"sha256:5e9c7b3567edbc2183607f7d9f3e7e89355b8f8984eec4d2cd1e1513c8f7b43f",
|
||||
"sha256:6a01ec49ca54ce03bc14e10de55dfc64187a2194b3b0e5ac0fdbe9b24767e79e",
|
||||
"sha256:6f22c040d196f841168b1456e77c30a18a3dc16b336ddbc5a24ce01ab4e95ae0",
|
||||
"sha256:81f2dd355b57770fdf292b54f3e0a9823ec27a543f947fa2eb4ec0df44f35f0d",
|
||||
"sha256:85e4c244e1de056d48dae466e9baf9437980c19fcde493e0db1a0a986e6d75b4",
|
||||
"sha256:8d0949b11681380b4a50ac3cd075e4816afe9fa4a8c8ae006c1ca26f0fa40ad8",
|
||||
"sha256:975f5c0793892c634c4920057da0de3a48bbbbd0a5c86f5fcf2f2fedf41b76da",
|
||||
"sha256:9e4fb2895b83993831ba2401b6404de953fdbfa9d7d4fa6a4756294a83bbc94f",
|
||||
"sha256:b35dca159c1c9fa8a5f9005e42133eed82705bf8e243da371a5e5826440e65ca",
|
||||
"sha256:b7b20c88873675903d6438d8b33fba027997193e274b9367421e610d9da76c08",
|
||||
"sha256:bb4b15fb1f0aafa65cbdc62d3c2078bea1ceecbfccc9a1f23a2113c9ac1191fa",
|
||||
"sha256:c0c7171aa5a57e522a04a31b84798b6c926234cb559c0939840c3235cf068813",
|
||||
"sha256:c317ddd7c586af350a6aef22b891e84b16bff1a27886ed5b30f15c1ed59caeaa",
|
||||
"sha256:c3abc34fed19fdeaead0ced8cf56dd121f08198008c033596aa6aae7cc58f59f",
|
||||
"sha256:ca68c52e3cae491ace2bf39b35fef4ce26c192fd70b4cd90f040d419f70893b5",
|
||||
"sha256:cf2cd387409b12d0a8b801610d6336ee7d24043b6dd965950eaec09b73e7262f",
|
||||
"sha256:d046a9aeba9bc53e88a41e58beb72b6205abb9a20f6c136161adf9128e589db5",
|
||||
"sha256:d5c20c8415173b119762b6110af64448adccd4d11f273fb9f718a9865b88a99c",
|
||||
"sha256:d86132922531f0dc5a4f424c7580a472a924dd737602638e704841c9cb24aea2",
|
||||
"sha256:dccff41478050e823271642837b904d5f9bda3f5cf7d371ce163f00a694118d6",
|
||||
"sha256:de85c26a5a1c72e695ab0454e92f60213b4459b8d7c502e0be7a6369690eeb1a",
|
||||
"sha256:e3a86b59b6227ef72ffc10d4b23f0fe994bef64d4667eab4fb8cd43de4223bec",
|
||||
"sha256:e79e73d5ee24196d3057340e356e6254af4d10e1fc22d3207ea8342fc5ffb977",
|
||||
"sha256:ea8210090a816d48a4291a47462bac750e3bc5c2442e6d64f7b8137a7c3f9ac5",
|
||||
"sha256:f3b7ec97e68b68cb1f9ddb82eda17b418f19a034fa8380a0ac04e8fe01532875"
|
||||
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
|
||||
"sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3",
|
||||
"sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674",
|
||||
"sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5",
|
||||
"sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f",
|
||||
"sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9",
|
||||
"sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e",
|
||||
"sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97",
|
||||
"sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a",
|
||||
"sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16",
|
||||
"sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00",
|
||||
"sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2",
|
||||
"sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71",
|
||||
"sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089",
|
||||
"sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc",
|
||||
"sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e",
|
||||
"sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b",
|
||||
"sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082",
|
||||
"sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13",
|
||||
"sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac",
|
||||
"sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615",
|
||||
"sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13",
|
||||
"sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4",
|
||||
"sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1",
|
||||
"sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99",
|
||||
"sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558",
|
||||
"sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f",
|
||||
"sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751",
|
||||
"sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423",
|
||||
"sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55",
|
||||
"sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed",
|
||||
"sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48",
|
||||
"sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76",
|
||||
"sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9",
|
||||
"sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.4.31"
|
||||
"version": "==1.4.32"
|
||||
},
|
||||
"telethon": {
|
||||
"hashes": [
|
||||
"sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816",
|
||||
"sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.24.0"
|
||||
},
|
||||
"tomli": {
|
||||
"hashes": [
|
||||
|
||||
@@ -6,4 +6,5 @@ from .gettr import GettrScraper
|
||||
from .odysee import OdyseeScraper
|
||||
from .rumble import RumbleScraper
|
||||
from .telegram_snscrape import TelegramSnscrapeScraper
|
||||
from .telegram_telethon import TelegramTelethonScraper
|
||||
from .twitter import TwitterScraper
|
||||
@@ -4,7 +4,6 @@ from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
import boto3
|
||||
from loguru import logger
|
||||
import ffmpeg
|
||||
@@ -84,7 +83,7 @@ class Scraper:
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -102,8 +101,9 @@ class ScraperController:
|
||||
|
||||
def register_scrapers(self, scraper: List[Scraper]):
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
def scrape_channels(self, channels: List[Channel]):
|
||||
|
||||
@logger.catch
|
||||
def scrape_channels(self, channels: List[Channel], media: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -128,7 +128,7 @@ class ScraperController:
|
||||
else:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since)
|
||||
posts = scraper.get_posts(channel, since=since, media=media)
|
||||
|
||||
for post in posts:
|
||||
session.add(post)
|
||||
|
||||
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class BitchuteScraper(Scraper):
|
||||
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
||||
library"""
|
||||
@@ -21,7 +22,7 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update(self.headers)
|
||||
@@ -42,11 +43,12 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
if media:
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -6,6 +6,7 @@ from garc import Garc
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GabScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
__version__ = "GabScraper 0.0.1"
|
||||
@@ -15,7 +16,7 @@ class GabScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Garc(profile = 'main')
|
||||
username = GabScraper.get_username_from_url(channel.url)
|
||||
|
||||
@@ -28,15 +29,17 @@ class GabScraper(Scraper):
|
||||
media_urls = []
|
||||
archived_urls = {}
|
||||
|
||||
media_urls.extend([p['url'] for p in post['media_attachments']])
|
||||
if media:
|
||||
|
||||
if post.get('repost') is not None:
|
||||
media_urls.extend([p['url'] for p in post['repost']['media_attachments']])
|
||||
media_urls.extend([p['url'] for p in post['media_attachments']])
|
||||
|
||||
for url in media_urls:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
if post.get('repost') is not None:
|
||||
media_urls.extend([p['url'] for p in post['repost']['media_attachments']])
|
||||
|
||||
for url in media_urls:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -7,6 +7,7 @@ from gogettr import PublicClient
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GettrScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
@@ -18,7 +19,7 @@ class GettrScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
@@ -29,24 +30,26 @@ class GettrScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
if media:
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[img] = archived_url
|
||||
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[img] = archived_url
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
if 'vid' in post:
|
||||
url = "https://media.gettr.com/" + post['vid']
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
if 'vid' in post:
|
||||
url = "https://media.gettr.com/" + post['vid']
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -19,7 +19,7 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username)
|
||||
@@ -31,17 +31,19 @@ class OdyseeScraper(Scraper):
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
url = video.info['streaming_url']
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
if media:
|
||||
url = video.info['streaming_url']
|
||||
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
all_comments = video.get_all_comments()
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ class RumbleScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = RumbleScraper.get_username_from_url(channel.url)
|
||||
scraper = get_channel_videos(username)
|
||||
@@ -33,11 +33,13 @@ class RumbleScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
url = post['media_url']
|
||||
if media:
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
url = post['media_url']
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -14,7 +14,7 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
scr = snscrape.modules.telegram.TelegramChannelScraper(
|
||||
channel.screenname)
|
||||
|
||||
@@ -29,17 +29,19 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
for image_url in post.images:
|
||||
logger.debug(f'Archiving image: {image_url}')
|
||||
media_blob, content_type, key = self.url_to_blob(image_url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[image_url] = archived_url
|
||||
if media:
|
||||
|
||||
if post.video:
|
||||
logger.debug(f'Archiving video: {post.video}')
|
||||
media_blob, content_type, key = self.url_to_blob(post.video)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post.video] = archived_url
|
||||
for image_url in post.images:
|
||||
logger.debug(f'Archiving image: {image_url}')
|
||||
media_blob, content_type, key = self.url_to_blob(image_url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[image_url] = archived_url
|
||||
|
||||
if post.video:
|
||||
logger.debug(f'Archiving video: {post.video}')
|
||||
media_blob, content_type, key = self.url_to_blob(post.video)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post.video] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
75
cisticola/scraper/telegram_telethon.py
Normal file
75
cisticola/scraper/telegram_telethon.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from typing import Generator
|
||||
from datetime import datetime, timezone
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from telethon.sync import TelegramClient
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
|
||||
class TelegramTelethonScraper(Scraper):
|
||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://t.me/')[1]
|
||||
if username.startswith('s/'):
|
||||
username = username.split('s/')[1]
|
||||
return username
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID_1']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH_1']
|
||||
phone = os.environ['TELEGRAM_PHONE_1']
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
|
||||
for post in client.iter_messages(username):
|
||||
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
||||
break
|
||||
|
||||
post_url = f'{channel.url}/{post.id}'
|
||||
key = f'{username}_{post.id}'
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if media:
|
||||
|
||||
if post.media is not None:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
output_file = Path(temp_dir, key)
|
||||
client.download_media(post.media, output_file)
|
||||
|
||||
output_file_with_ext = os.listdir(temp_dir)[0]
|
||||
filename = Path(temp_dir, output_file_with_ext)
|
||||
|
||||
with open(filename, 'rb') as f:
|
||||
blob = f.read()
|
||||
|
||||
# TODO specify Content-Type
|
||||
archived_url = self.archive_media(blob = blob, content_type = '', key = output_file_with_ext)
|
||||
archived_urls[post_url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Telegram",
|
||||
channel=channel.id,
|
||||
platform_id=post_url,
|
||||
date=post.date.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls)
|
||||
@@ -12,7 +12,7 @@ class TwitterScraper(Scraper):
|
||||
"""An implementation of a Scraper for Twitter, using snscrape library"""
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
scraper = TwitterProfileScraper(channel.platform_id)
|
||||
|
||||
first = True
|
||||
|
||||
@@ -10,4 +10,7 @@ addopts =
|
||||
--cov='cisticola'
|
||||
--cov-report html:reports/coverage
|
||||
--html='reports/tests.html'
|
||||
--self-contained-html
|
||||
--self-contained-html
|
||||
filterwarnings =
|
||||
ignore:the imp module is deprecated:DeprecationWarning
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
@@ -10,7 +10,7 @@ from cisticola.scraper import (
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
logger.add("../russian_telegram_ingest.log", level = "INFO")
|
||||
logger.add("../russian_telegram_ingest.log")
|
||||
|
||||
test_channels = [
|
||||
# Channel(
|
||||
|
||||
6
test.py
6
test.py
@@ -9,6 +9,7 @@ from cisticola.scraper import (
|
||||
OdyseeScraper,
|
||||
RumbleScraper,
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
|
||||
test_channels = [
|
||||
@@ -117,11 +118,12 @@ scrapers = [
|
||||
OdyseeScraper(),
|
||||
RumbleScraper(),
|
||||
TelegramSnscrapeScraper(),
|
||||
TwitterScraper()]
|
||||
TwitterScraper()
|
||||
TelegramTelethonScraper()]
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
controller.scrape_channels(test_channels, media = True)
|
||||
@@ -81,7 +81,7 @@ RUMBLE_CHANNEL_KWARGS = {
|
||||
'chat': False,
|
||||
'notes': ''}
|
||||
|
||||
TELEGRAM_SNSCRAPE_CHANNEL_KWARGS = {
|
||||
TELEGRAM_CHANNEL_KWARGS = {
|
||||
'id': 5,
|
||||
'name': 'South West Ohio Proud Boys (test)',
|
||||
'platform_id': -1001276612436,
|
||||
@@ -141,7 +141,7 @@ def channel_kwargs():
|
||||
'gettr' : GETTR_CHANNEL_KWARGS,
|
||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram_snscrape' : TELEGRAM_SNSCRAPE_CHANNEL_KWARGS,
|
||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import BitchuteScraper
|
||||
|
||||
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['bitchute'])]
|
||||
controller.register_scraper(scraper = BitchuteScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_bitchute_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['bitchute'])]
|
||||
controller.register_scraper(BitchuteScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = BitchuteScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GabScraper
|
||||
|
||||
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gab'])]
|
||||
controller.register_scraper(scraper = GabScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_gab_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gab'])]
|
||||
controller.register_scraper(GabScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = GabScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
|
||||
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gettr'])]
|
||||
controller.register_scraper(scraper = GettrScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_gettr_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gettr'])]
|
||||
controller.register_scraper(GettrScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = GettrScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import OdyseeScraper
|
||||
|
||||
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['odysee'])]
|
||||
controller.register_scraper(scraper = OdyseeScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_odysee_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['odysee'])]
|
||||
controller.register_scraper(OdyseeScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = OdyseeScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
|
||||
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['rumble'])]
|
||||
controller.register_scraper(scraper = RumbleScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_rumble_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['rumble'])]
|
||||
controller.register_scraper(RumbleScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = RumbleScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramSnscrapeScraper
|
||||
|
||||
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramSnscrapeScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram_snscrape'])]
|
||||
controller.register_scraper(TelegramSnscrapeScraper())
|
||||
controller.scrape_channels(channels)
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramSnscrapeScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
14
tests/scraper/telegram_telethon.py
Normal file
14
tests/scraper/telegram_telethon.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
|
||||
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
@@ -1,8 +1,14 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
|
||||
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['twitter'])]
|
||||
controller.register_scraper(scraper = TwitterScraper())
|
||||
controller.scrape_channels(channels = channels, media = False)
|
||||
|
||||
def test_scrape_twitter_channel(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['twitter'])]
|
||||
controller.register_scraper(TwitterScraper())
|
||||
controller.scrape_channels(channels)
|
||||
controller.register_scraper(scraper = TwitterScraper())
|
||||
controller.scrape_channels(channels = channels, media = True)
|
||||
|
||||
Reference in New Issue
Block a user