From 739e1d848419d7641d509924d8f77668b7523a51 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 9 Mar 2022 12:12:01 -0600 Subject: [PATCH] added capability of running scraper without archiving media, and implemented prototype Telethon scraper for Telegram --- .gitignore | 1 + Pipfile | 1 + Pipfile.lock | 172 +++++++++++++++---------- cisticola/scraper/__init__.py | 1 + cisticola/scraper/base.py | 10 +- cisticola/scraper/bitchute.py | 14 +- cisticola/scraper/gab.py | 19 +-- cisticola/scraper/gettr.py | 35 ++--- cisticola/scraper/odysee.py | 22 ++-- cisticola/scraper/rumble.py | 12 +- cisticola/scraper/telegram_snscrape.py | 24 ++-- cisticola/scraper/telegram_telethon.py | 75 +++++++++++ cisticola/scraper/twitter.py | 2 +- pytest.ini | 5 +- russian_telegram_ingest.py | 2 +- test.py | 6 +- tests/conftest.py | 4 +- tests/scraper/bitchute.py | 10 +- tests/scraper/gab.py | 10 +- tests/scraper/gettr.py | 10 +- tests/scraper/odysee.py | 10 +- tests/scraper/rumble.py | 10 +- tests/scraper/telegram_snscrape.py | 12 +- tests/scraper/telegram_telethon.py | 14 ++ tests/scraper/twitter.py | 10 +- 25 files changed, 340 insertions(+), 151 deletions(-) create mode 100644 cisticola/scraper/telegram_telethon.py create mode 100644 tests/scraper/telegram_telethon.py diff --git a/.gitignore b/.gitignore index 1f00762..747d85e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ docs/source/_* *.ipynb *.db .env +*.session # Unit test / coverage reports reports diff --git a/Pipfile b/Pipfile index c8eabfc..62f2c74 100644 --- a/Pipfile +++ b/Pipfile @@ -17,6 +17,7 @@ ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" youtube-dl = "*" +telethon = "*" [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 76b4c3f..0ca0eda 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ea2a1f1dff68fa0bd30dab06553e913f467c3b1399388b97f0ed913ab74c6e85" + "sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,19 @@ }, "boto3": { "hashes": [ - "sha256:75709628320cea8ce137975dc33b75213c2e4f6e7cd09e55290de7245e2c79e2", - "sha256:c92ec20a670721b5a1bc013b305a84db2b7f9c716653b3056ce7e2fbd2a180ef" + "sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a", + "sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163" ], "index": "pypi", - "version": "==1.21.12" + "version": "==1.21.15" }, "botocore": { "hashes": [ - "sha256:0174999a04b0a2e42457106093ace9b36fa94772a442d9bcf60750263d1d073e", - "sha256:0cd7395311a3fef4aad8df8f511b4f7d221c24ae30934bd5c03458b0fc096d0c" + "sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b", + "sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4" ], "markers": "python_version >= '3.6'", - "version": "==1.24.12" + "version": "==1.24.15" }, "bs4": { "hashes": [ @@ -378,28 +378,28 @@ }, "numpy": { "hashes": [ - "sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f", - "sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf", - "sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89", - "sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd", - "sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b", - "sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a", - "sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b", - "sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e", - "sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956", - "sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2", - "sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f", - "sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a", - "sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896", - "sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c", - "sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6", - "sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f", - "sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7", - "sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082", - "sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677" + "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676", + "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4", + "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce", + "sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123", + "sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1", + "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e", + "sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5", + "sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d", + "sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a", + "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab", + "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75", + "sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168", + "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4", + "sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f", + "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18", + "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62", + "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe", + "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802", + "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa" ], "markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'", - "version": "==1.22.2" + "version": "==1.22.3" }, "packaging": { "hashes": [ @@ -446,7 +446,7 @@ }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", - "ref": "8506fd43770661cdcf92c5cac2356cba74778834" + "ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed" }, "py": { "hashes": [ @@ -456,6 +456,30 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==1.11.0" }, + "pyaes": { + "hashes": [ + "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" + ], + "version": "==1.6.1" + }, + "pyasn1": { + "hashes": [ + "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", + "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576", + "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf", + "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7", + "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", + "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00", + "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8", + "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86", + "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12", + "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776", + "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba", + "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2", + "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3" + ], + "version": "==0.4.8" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -592,9 +616,6 @@ "version": "==2022.3.2" }, "requests": { - "extras": [ - "socks" - ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -602,6 +623,14 @@ "index": "pypi", "version": "==2.27.1" }, + "rsa": { + "hashes": [ + "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", + "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" + ], + "markers": "python_version >= '3.6' and python_version < '4.0'", + "version": "==4.8" + }, "s3transfer": { "hashes": [ "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971", @@ -695,45 +724,52 @@ }, "sqlalchemy": { "hashes": [ - "sha256:05fa14f279d43df68964ad066f653193187909950aa0163320b728edfc400167", - "sha256:0ddc5e5ccc0160e7ad190e5c61eb57560f38559e22586955f205e537cda26034", - "sha256:15a03261aa1e68f208e71ae3cd845b00063d242cbf8c87348a0c2c0fc6e1f2ac", - "sha256:289465162b1fa1e7a982f8abe59d26a8331211cad4942e8031d2b7db1f75e649", - "sha256:2e216c13ecc7fcdcbb86bb3225425b3ed338e43a8810c7089ddb472676124b9b", - "sha256:2fd4d3ca64c41dae31228b80556ab55b6489275fb204827f6560b65f95692cf3", - "sha256:330eb45395874cc7787214fdd4489e2afb931bc49e0a7a8f9cd56d6e9c5b1639", - "sha256:3c7ed6c69debaf6198fadb1c16ae1253a29a7670bbf0646f92582eb465a0b999", - "sha256:4ad31cec8b49fd718470328ad9711f4dc703507d434fd45461096da0a7135ee0", - "sha256:57205844f246bab9b666a32f59b046add8995c665d9ecb2b7b837b087df90639", - "sha256:582b59d1e5780a447aada22b461e50b404a9dc05768da1d87368ad8190468418", - "sha256:5e9c7b3567edbc2183607f7d9f3e7e89355b8f8984eec4d2cd1e1513c8f7b43f", - "sha256:6a01ec49ca54ce03bc14e10de55dfc64187a2194b3b0e5ac0fdbe9b24767e79e", - "sha256:6f22c040d196f841168b1456e77c30a18a3dc16b336ddbc5a24ce01ab4e95ae0", - "sha256:81f2dd355b57770fdf292b54f3e0a9823ec27a543f947fa2eb4ec0df44f35f0d", - "sha256:85e4c244e1de056d48dae466e9baf9437980c19fcde493e0db1a0a986e6d75b4", - "sha256:8d0949b11681380b4a50ac3cd075e4816afe9fa4a8c8ae006c1ca26f0fa40ad8", - "sha256:975f5c0793892c634c4920057da0de3a48bbbbd0a5c86f5fcf2f2fedf41b76da", - "sha256:9e4fb2895b83993831ba2401b6404de953fdbfa9d7d4fa6a4756294a83bbc94f", - "sha256:b35dca159c1c9fa8a5f9005e42133eed82705bf8e243da371a5e5826440e65ca", - "sha256:b7b20c88873675903d6438d8b33fba027997193e274b9367421e610d9da76c08", - "sha256:bb4b15fb1f0aafa65cbdc62d3c2078bea1ceecbfccc9a1f23a2113c9ac1191fa", - "sha256:c0c7171aa5a57e522a04a31b84798b6c926234cb559c0939840c3235cf068813", - "sha256:c317ddd7c586af350a6aef22b891e84b16bff1a27886ed5b30f15c1ed59caeaa", - "sha256:c3abc34fed19fdeaead0ced8cf56dd121f08198008c033596aa6aae7cc58f59f", - "sha256:ca68c52e3cae491ace2bf39b35fef4ce26c192fd70b4cd90f040d419f70893b5", - "sha256:cf2cd387409b12d0a8b801610d6336ee7d24043b6dd965950eaec09b73e7262f", - "sha256:d046a9aeba9bc53e88a41e58beb72b6205abb9a20f6c136161adf9128e589db5", - "sha256:d5c20c8415173b119762b6110af64448adccd4d11f273fb9f718a9865b88a99c", - "sha256:d86132922531f0dc5a4f424c7580a472a924dd737602638e704841c9cb24aea2", - "sha256:dccff41478050e823271642837b904d5f9bda3f5cf7d371ce163f00a694118d6", - "sha256:de85c26a5a1c72e695ab0454e92f60213b4459b8d7c502e0be7a6369690eeb1a", - "sha256:e3a86b59b6227ef72ffc10d4b23f0fe994bef64d4667eab4fb8cd43de4223bec", - "sha256:e79e73d5ee24196d3057340e356e6254af4d10e1fc22d3207ea8342fc5ffb977", - "sha256:ea8210090a816d48a4291a47462bac750e3bc5c2442e6d64f7b8137a7c3f9ac5", - "sha256:f3b7ec97e68b68cb1f9ddb82eda17b418f19a034fa8380a0ac04e8fe01532875" + "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", + "sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3", + "sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674", + "sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5", + "sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f", + "sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9", + "sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e", + "sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97", + "sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a", + "sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16", + "sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00", + "sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2", + "sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71", + "sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089", + "sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc", + "sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e", + "sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b", + "sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082", + "sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13", + "sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac", + "sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615", + "sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13", + "sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4", + "sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1", + "sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99", + "sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558", + "sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f", + "sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751", + "sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423", + "sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55", + "sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed", + "sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48", + "sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76", + "sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9", + "sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd" ], "index": "pypi", - "version": "==1.4.31" + "version": "==1.4.32" + }, + "telethon": { + "hashes": [ + "sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816", + "sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c" + ], + "index": "pypi", + "version": "==1.24.0" }, "tomli": { "hashes": [ diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 65b41e8..2d692e8 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -6,4 +6,5 @@ from .gettr import GettrScraper from .odysee import OdyseeScraper from .rumble import RumbleScraper from .telegram_snscrape import TelegramSnscrapeScraper +from .telegram_telethon import TelegramTelethonScraper from .twitter import TwitterScraper \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 0d8e951..9b717d5 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -4,7 +4,6 @@ from io import BytesIO from urllib.parse import urlparse import tempfile -import requests import boto3 from loguru import logger import ffmpeg @@ -84,7 +83,7 @@ class Scraper: def can_handle(self, channel: Channel) -> bool: raise NotImplementedError - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: raise NotImplementedError @@ -102,8 +101,9 @@ class ScraperController: def register_scrapers(self, scraper: List[Scraper]): self.scrapers.extend(scraper) - - def scrape_channels(self, channels: List[Channel]): + + @logger.catch + def scrape_channels(self, channels: List[Channel], media: bool = True): if self.session is None: logger.error("No DB session") return @@ -128,7 +128,7 @@ class ScraperController: else: since = None - posts = scraper.get_posts(channel, since=since) + posts = scraper.get_posts(channel, since=since, media=media) for post in posts: session.add(post) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index aa8d53c..9e0957b 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper + class BitchuteScraper(Scraper): """An implementation of a Scraper for Bitchute, using classes from the 4cat library""" @@ -21,7 +22,7 @@ class BitchuteScraper(Scraper): return username - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: session = requests.Session() session.headers.update(self.headers) @@ -42,11 +43,12 @@ class BitchuteScraper(Scraper): archived_urls = {} - if 'video_url' in post: - url = post['video_url'] - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[url] = archived_url + if media: + if 'video_url' in post: + url = post['video_url'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index b3cd5e4..16f058a 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -6,6 +6,7 @@ from garc import Garc from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper + class GabScraper(Scraper): """An implementation of a Scraper for Gab, using GARC library""" __version__ = "GabScraper 0.0.1" @@ -15,7 +16,7 @@ class GabScraper(Scraper): return username - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: client = Garc(profile = 'main') username = GabScraper.get_username_from_url(channel.url) @@ -28,15 +29,17 @@ class GabScraper(Scraper): media_urls = [] archived_urls = {} - media_urls.extend([p['url'] for p in post['media_attachments']]) + if media: - if post.get('repost') is not None: - media_urls.extend([p['url'] for p in post['repost']['media_attachments']]) + media_urls.extend([p['url'] for p in post['media_attachments']]) - for url in media_urls: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[url] = archived_url + if post.get('repost') is not None: + media_urls.extend([p['url'] for p in post['repost']['media_attachments']]) + + for url in media_urls: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 4a1c206..8f4cc19 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -7,6 +7,7 @@ from gogettr import PublicClient from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper + class GettrScraper(Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -18,7 +19,7 @@ class GettrScraper(Scraper): return username - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") @@ -29,24 +30,26 @@ class GettrScraper(Scraper): archived_urls = {} - if 'imgs' in post: - for img in post['imgs']: - url = "https://media.gettr.com/" + img + if media: + + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[img] = archived_url + + if 'main' in post: + url = "https://media.gettr.com/" + post['main'] media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[img] = archived_url + archived_urls[post['main']] = archived_url - if 'main' in post: - url = "https://media.gettr.com/" + post['main'] - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[post['main']] = archived_url - - if 'vid' in post: - url = "https://media.gettr.com/" + post['vid'] - media_blob, content_type, key = self.m3u8_url_to_blob(url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[post['vid']] = archived_url + if 'vid' in post: + url = "https://media.gettr.com/" + post['vid'] + media_blob, content_type, key = self.m3u8_url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post['vid']] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index bd1d3aa..27921bd 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -19,7 +19,7 @@ class OdyseeScraper(Scraper): return username - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: username = OdyseeScraper.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username) @@ -31,17 +31,19 @@ class OdyseeScraper(Scraper): break archived_urls = {} - url = video.info['streaming_url'] - # Check if file is a video file or an m3u8 file - r = requests.head(url) - if r.headers['Content-Type'] == 'text/html; charset=utf-8': - media_blob, content_type, key = self.m3u8_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) + if media: + url = video.info['streaming_url'] - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[url] = archived_url + # Check if file is a video file or an m3u8 file + r = requests.head(url) + if r.headers['Content-Type'] == 'text/html; charset=utf-8': + media_blob, content_type, key = self.m3u8_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url all_comments = video.get_all_comments() diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 7c681f4..e4316d5 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -22,7 +22,7 @@ class RumbleScraper(Scraper): return username - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: username = RumbleScraper.get_username_from_url(channel.url) scraper = get_channel_videos(username) @@ -33,11 +33,13 @@ class RumbleScraper(Scraper): archived_urls = {} - url = post['media_url'] + if media: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[post['media_url']] = archived_url + url = post['media_url'] + + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post['media_url']] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index e2f4fd8..458f726 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -14,7 +14,7 @@ class TelegramSnscrapeScraper(Scraper): if channel.platform == "Telegram" and channel.public and not channel.chat: return True - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) @@ -29,17 +29,19 @@ class TelegramSnscrapeScraper(Scraper): archived_urls = {} - for image_url in post.images: - logger.debug(f'Archiving image: {image_url}') - media_blob, content_type, key = self.url_to_blob(image_url) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[image_url] = archived_url + if media: - if post.video: - logger.debug(f'Archiving video: {post.video}') - media_blob, content_type, key = self.url_to_blob(post.video) - archived_url = self.archive_media(media_blob, content_type, key) - archived_urls[post.video] = archived_url + for image_url in post.images: + logger.debug(f'Archiving image: {image_url}') + media_blob, content_type, key = self.url_to_blob(image_url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[image_url] = archived_url + + if post.video: + logger.debug(f'Archiving video: {post.video}') + media_blob, content_type, key = self.url_to_blob(post.video) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post.video] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py new file mode 100644 index 0000000..5eb17e7 --- /dev/null +++ b/cisticola/scraper/telegram_telethon.py @@ -0,0 +1,75 @@ +from typing import Generator +from datetime import datetime, timezone +import os +import json +import tempfile +from pathlib import Path + +from loguru import logger +from telethon.sync import TelegramClient + +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper + +MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] + +class TelegramTelethonScraper(Scraper): + __version__ = "TelegramTelethonScraper 0.0.1" + + def get_username_from_url(self, url): + username = url.split('https://t.me/')[1] + if username.startswith('s/'): + username = username.split('s/')[1] + return username + + def can_handle(self, channel): + if channel.platform == "Telegram" and channel.public and not channel.chat: + return True + + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: + + username = self.get_username_from_url(channel.url) + + api_id = os.environ['TELEGRAM_API_ID_1'] + api_hash = os.environ['TELEGRAM_API_HASH_1'] + phone = os.environ['TELEGRAM_PHONE_1'] + + with TelegramClient(phone, api_id, api_hash) as client: + + for post in client.iter_messages(username): + + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): + logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}') + break + + post_url = f'{channel.url}/{post.id}' + key = f'{username}_{post.id}' + + archived_urls = {} + + if media: + + if post.media is not None: + with tempfile.TemporaryDirectory() as temp_dir: + output_file = Path(temp_dir, key) + client.download_media(post.media, output_file) + + output_file_with_ext = os.listdir(temp_dir)[0] + filename = Path(temp_dir, output_file_with_ext) + + with open(filename, 'rb') as f: + blob = f.read() + + # TODO specify Content-Type + archived_url = self.archive_media(blob = blob, content_type = '', key = output_file_with_ext) + archived_urls[post_url] = archived_url + + yield ScraperResult( + scraper=self.__version__, + platform="Telegram", + channel=channel.id, + platform_id=post_url, + date=post.date.replace(tzinfo=timezone.utc), + date_archived=datetime.now(timezone.utc), + raw_data=json.dumps(post.to_dict(), default=str), + archived_urls=archived_urls) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index de72de2..19eb33c 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -12,7 +12,7 @@ class TwitterScraper(Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]: scraper = TwitterProfileScraper(channel.platform_id) first = True diff --git a/pytest.ini b/pytest.ini index 844d239..09a94e1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -10,4 +10,7 @@ addopts = --cov='cisticola' --cov-report html:reports/coverage --html='reports/tests.html' - --self-contained-html \ No newline at end of file + --self-contained-html +filterwarnings = + ignore:the imp module is deprecated:DeprecationWarning + ignore:The localize method is no longer necessary, as this time zone supports the fold attribute \ No newline at end of file diff --git a/russian_telegram_ingest.py b/russian_telegram_ingest.py index 97c58a0..7f490ca 100644 --- a/russian_telegram_ingest.py +++ b/russian_telegram_ingest.py @@ -10,7 +10,7 @@ from cisticola.scraper import ( logger.remove() logger.add(sys.stderr, level="INFO") -logger.add("../russian_telegram_ingest.log", level = "INFO") +logger.add("../russian_telegram_ingest.log") test_channels = [ # Channel( diff --git a/test.py b/test.py index aec6b22..21add12 100644 --- a/test.py +++ b/test.py @@ -9,6 +9,7 @@ from cisticola.scraper import ( OdyseeScraper, RumbleScraper, TelegramSnscrapeScraper, + TelegramTelethonScraper, TwitterScraper) test_channels = [ @@ -117,11 +118,12 @@ scrapers = [ OdyseeScraper(), RumbleScraper(), TelegramSnscrapeScraper(), - TwitterScraper()] + TwitterScraper() + TelegramTelethonScraper()] controller.register_scrapers(scrapers) engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) -controller.scrape_channels(test_channels) \ No newline at end of file +controller.scrape_channels(test_channels, media = True) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 547d02f..161439d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -81,7 +81,7 @@ RUMBLE_CHANNEL_KWARGS = { 'chat': False, 'notes': ''} -TELEGRAM_SNSCRAPE_CHANNEL_KWARGS = { +TELEGRAM_CHANNEL_KWARGS = { 'id': 5, 'name': 'South West Ohio Proud Boys (test)', 'platform_id': -1001276612436, @@ -141,7 +141,7 @@ def channel_kwargs(): 'gettr' : GETTR_CHANNEL_KWARGS, 'odysee' : ODYSEE_CHANNEL_KWARGS, 'rumble' : RUMBLE_CHANNEL_KWARGS, - 'telegram_snscrape' : TELEGRAM_SNSCRAPE_CHANNEL_KWARGS, + 'telegram' : TELEGRAM_CHANNEL_KWARGS, 'twitter' : TWITTER_CHANNEL_KWARGS} #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index 83883d8..2071568 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import BitchuteScraper +def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['bitchute'])] + controller.register_scraper(scraper = BitchuteScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_bitchute_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['bitchute'])] - controller.register_scraper(BitchuteScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = BitchuteScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index a360af3..daf100a 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import GabScraper +def test_scrape_gab_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['gab'])] + controller.register_scraper(scraper = GabScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_gab_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gab'])] - controller.register_scraper(GabScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = GabScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index ac08db7..13800c2 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import GettrScraper +def test_scrape_gettr_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['gettr'])] + controller.register_scraper(scraper = GettrScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_gettr_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] - controller.register_scraper(GettrScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = GettrScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index c13d08d..0fda0a7 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import OdyseeScraper +def test_scrape_odysee_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['odysee'])] + controller.register_scraper(scraper = OdyseeScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_odysee_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['odysee'])] - controller.register_scraper(OdyseeScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = OdyseeScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index 8c00aa5..0f43463 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import RumbleScraper +def test_scrape_rumble_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['rumble'])] + controller.register_scraper(scraper = RumbleScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_rumble_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] - controller.register_scraper(RumbleScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = RumbleScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index 077f1bb..677d949 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import TelegramSnscrapeScraper +def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['telegram'])] + controller.register_scraper(scraper = TelegramSnscrapeScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): - channels = [Channel(**channel_kwargs['telegram_snscrape'])] - controller.register_scraper(TelegramSnscrapeScraper()) - controller.scrape_channels(channels) + channels = [Channel(**channel_kwargs['telegram'])] + controller.register_scraper(scraper = TelegramSnscrapeScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py new file mode 100644 index 0000000..3590c2e --- /dev/null +++ b/tests/scraper/telegram_telethon.py @@ -0,0 +1,14 @@ +from cisticola.base import Channel +from cisticola.scraper import TelegramTelethonScraper + +def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['telegram'])] + controller.register_scraper(scraper = TelegramTelethonScraper()) + controller.scrape_channels(channels = channels, media = False) + +def test_scrape_telegram_telethon_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['telegram'])] + controller.register_scraper(scraper = TelegramTelethonScraper()) + controller.scrape_channels(channels = channels, media = True) diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index 5c22b62..cb03045 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -1,8 +1,14 @@ from cisticola.base import Channel from cisticola.scraper import TwitterScraper +def test_scrape_twitter_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['twitter'])] + controller.register_scraper(scraper = TwitterScraper()) + controller.scrape_channels(channels = channels, media = False) + def test_scrape_twitter_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] - controller.register_scraper(TwitterScraper()) - controller.scrape_channels(channels) + controller.register_scraper(scraper = TwitterScraper()) + controller.scrape_channels(channels = channels, media = True)