added capability of running scraper without archiving media, and implemented prototype Telethon scraper for Telegram

This commit is contained in:
Tristan Lee
2022-03-09 12:12:01 -06:00
parent 506fb54a53
commit 739e1d8484
25 changed files with 340 additions and 151 deletions

1
.gitignore vendored
View File

@@ -8,6 +8,7 @@ docs/source/_*
*.ipynb
*.db
.env
*.session
# Unit test / coverage reports
reports

View File

@@ -17,6 +17,7 @@ ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
telethon = "*"
[dev-packages]
pytest = "*"

172
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "ea2a1f1dff68fa0bd30dab06553e913f467c3b1399388b97f0ed913ab74c6e85"
"sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013"
},
"pipfile-spec": 6,
"requires": {
@@ -49,19 +49,19 @@
},
"boto3": {
"hashes": [
"sha256:75709628320cea8ce137975dc33b75213c2e4f6e7cd09e55290de7245e2c79e2",
"sha256:c92ec20a670721b5a1bc013b305a84db2b7f9c716653b3056ce7e2fbd2a180ef"
"sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a",
"sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163"
],
"index": "pypi",
"version": "==1.21.12"
"version": "==1.21.15"
},
"botocore": {
"hashes": [
"sha256:0174999a04b0a2e42457106093ace9b36fa94772a442d9bcf60750263d1d073e",
"sha256:0cd7395311a3fef4aad8df8f511b4f7d221c24ae30934bd5c03458b0fc096d0c"
"sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b",
"sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.12"
"version": "==1.24.15"
},
"bs4": {
"hashes": [
@@ -378,28 +378,28 @@
},
"numpy": {
"hashes": [
"sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f",
"sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf",
"sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89",
"sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd",
"sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b",
"sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a",
"sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b",
"sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e",
"sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956",
"sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2",
"sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f",
"sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a",
"sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896",
"sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c",
"sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6",
"sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f",
"sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7",
"sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082",
"sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677"
"sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676",
"sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4",
"sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce",
"sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123",
"sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1",
"sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e",
"sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5",
"sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d",
"sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a",
"sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab",
"sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75",
"sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168",
"sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4",
"sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f",
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18",
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62",
"sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe",
"sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802",
"sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"
],
"markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'",
"version": "==1.22.2"
"version": "==1.22.3"
},
"packaging": {
"hashes": [
@@ -446,7 +446,7 @@
},
"polyphemus": {
"git": "https://github.com/bellingcat/polyphemus.git",
"ref": "8506fd43770661cdcf92c5cac2356cba74778834"
"ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed"
},
"py": {
"hashes": [
@@ -456,6 +456,30 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pyaes": {
"hashes": [
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
],
"version": "==1.6.1"
},
"pyasn1": {
"hashes": [
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
"sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576",
"sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf",
"sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7",
"sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d",
"sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00",
"sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8",
"sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86",
"sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12",
"sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776",
"sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba",
"sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2",
"sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"
],
"version": "==0.4.8"
},
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
@@ -592,9 +616,6 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -602,6 +623,14 @@
"index": "pypi",
"version": "==2.27.1"
},
"rsa": {
"hashes": [
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4.0'",
"version": "==4.8"
},
"s3transfer": {
"hashes": [
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
@@ -695,45 +724,52 @@
},
"sqlalchemy": {
"hashes": [
"sha256:05fa14f279d43df68964ad066f653193187909950aa0163320b728edfc400167",
"sha256:0ddc5e5ccc0160e7ad190e5c61eb57560f38559e22586955f205e537cda26034",
"sha256:15a03261aa1e68f208e71ae3cd845b00063d242cbf8c87348a0c2c0fc6e1f2ac",
"sha256:289465162b1fa1e7a982f8abe59d26a8331211cad4942e8031d2b7db1f75e649",
"sha256:2e216c13ecc7fcdcbb86bb3225425b3ed338e43a8810c7089ddb472676124b9b",
"sha256:2fd4d3ca64c41dae31228b80556ab55b6489275fb204827f6560b65f95692cf3",
"sha256:330eb45395874cc7787214fdd4489e2afb931bc49e0a7a8f9cd56d6e9c5b1639",
"sha256:3c7ed6c69debaf6198fadb1c16ae1253a29a7670bbf0646f92582eb465a0b999",
"sha256:4ad31cec8b49fd718470328ad9711f4dc703507d434fd45461096da0a7135ee0",
"sha256:57205844f246bab9b666a32f59b046add8995c665d9ecb2b7b837b087df90639",
"sha256:582b59d1e5780a447aada22b461e50b404a9dc05768da1d87368ad8190468418",
"sha256:5e9c7b3567edbc2183607f7d9f3e7e89355b8f8984eec4d2cd1e1513c8f7b43f",
"sha256:6a01ec49ca54ce03bc14e10de55dfc64187a2194b3b0e5ac0fdbe9b24767e79e",
"sha256:6f22c040d196f841168b1456e77c30a18a3dc16b336ddbc5a24ce01ab4e95ae0",
"sha256:81f2dd355b57770fdf292b54f3e0a9823ec27a543f947fa2eb4ec0df44f35f0d",
"sha256:85e4c244e1de056d48dae466e9baf9437980c19fcde493e0db1a0a986e6d75b4",
"sha256:8d0949b11681380b4a50ac3cd075e4816afe9fa4a8c8ae006c1ca26f0fa40ad8",
"sha256:975f5c0793892c634c4920057da0de3a48bbbbd0a5c86f5fcf2f2fedf41b76da",
"sha256:9e4fb2895b83993831ba2401b6404de953fdbfa9d7d4fa6a4756294a83bbc94f",
"sha256:b35dca159c1c9fa8a5f9005e42133eed82705bf8e243da371a5e5826440e65ca",
"sha256:b7b20c88873675903d6438d8b33fba027997193e274b9367421e610d9da76c08",
"sha256:bb4b15fb1f0aafa65cbdc62d3c2078bea1ceecbfccc9a1f23a2113c9ac1191fa",
"sha256:c0c7171aa5a57e522a04a31b84798b6c926234cb559c0939840c3235cf068813",
"sha256:c317ddd7c586af350a6aef22b891e84b16bff1a27886ed5b30f15c1ed59caeaa",
"sha256:c3abc34fed19fdeaead0ced8cf56dd121f08198008c033596aa6aae7cc58f59f",
"sha256:ca68c52e3cae491ace2bf39b35fef4ce26c192fd70b4cd90f040d419f70893b5",
"sha256:cf2cd387409b12d0a8b801610d6336ee7d24043b6dd965950eaec09b73e7262f",
"sha256:d046a9aeba9bc53e88a41e58beb72b6205abb9a20f6c136161adf9128e589db5",
"sha256:d5c20c8415173b119762b6110af64448adccd4d11f273fb9f718a9865b88a99c",
"sha256:d86132922531f0dc5a4f424c7580a472a924dd737602638e704841c9cb24aea2",
"sha256:dccff41478050e823271642837b904d5f9bda3f5cf7d371ce163f00a694118d6",
"sha256:de85c26a5a1c72e695ab0454e92f60213b4459b8d7c502e0be7a6369690eeb1a",
"sha256:e3a86b59b6227ef72ffc10d4b23f0fe994bef64d4667eab4fb8cd43de4223bec",
"sha256:e79e73d5ee24196d3057340e356e6254af4d10e1fc22d3207ea8342fc5ffb977",
"sha256:ea8210090a816d48a4291a47462bac750e3bc5c2442e6d64f7b8137a7c3f9ac5",
"sha256:f3b7ec97e68b68cb1f9ddb82eda17b418f19a034fa8380a0ac04e8fe01532875"
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
"sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3",
"sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674",
"sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5",
"sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f",
"sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9",
"sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e",
"sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97",
"sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a",
"sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16",
"sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00",
"sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2",
"sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71",
"sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089",
"sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc",
"sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e",
"sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b",
"sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082",
"sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13",
"sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac",
"sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615",
"sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13",
"sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4",
"sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1",
"sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99",
"sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558",
"sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f",
"sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751",
"sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423",
"sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55",
"sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed",
"sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48",
"sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76",
"sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9",
"sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd"
],
"index": "pypi",
"version": "==1.4.31"
"version": "==1.4.32"
},
"telethon": {
"hashes": [
"sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816",
"sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c"
],
"index": "pypi",
"version": "==1.24.0"
},
"tomli": {
"hashes": [

View File

@@ -6,4 +6,5 @@ from .gettr import GettrScraper
from .odysee import OdyseeScraper
from .rumble import RumbleScraper
from .telegram_snscrape import TelegramSnscrapeScraper
from .telegram_telethon import TelegramTelethonScraper
from .twitter import TwitterScraper

View File

@@ -4,7 +4,6 @@ from io import BytesIO
from urllib.parse import urlparse
import tempfile
import requests
import boto3
from loguru import logger
import ffmpeg
@@ -84,7 +83,7 @@ class Scraper:
def can_handle(self, channel: Channel) -> bool:
raise NotImplementedError
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
raise NotImplementedError
@@ -102,8 +101,9 @@ class ScraperController:
def register_scrapers(self, scraper: List[Scraper]):
self.scrapers.extend(scraper)
def scrape_channels(self, channels: List[Channel]):
@logger.catch
def scrape_channels(self, channels: List[Channel], media: bool = True):
if self.session is None:
logger.error("No DB session")
return
@@ -128,7 +128,7 @@ class ScraperController:
else:
since = None
posts = scraper.get_posts(channel, since=since)
posts = scraper.get_posts(channel, since=since, media=media)
for post in posts:
session.add(post)

View File

@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
library"""
@@ -21,7 +22,7 @@ class BitchuteScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
session = requests.Session()
session.headers.update(self.headers)
@@ -42,11 +43,12 @@ class BitchuteScraper(Scraper):
archived_urls = {}
if 'video_url' in post:
url = post['video_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
if media:
if 'video_url' in post:
url = post['video_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -6,6 +6,7 @@ from garc import Garc
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
"""An implementation of a Scraper for Gab, using GARC library"""
__version__ = "GabScraper 0.0.1"
@@ -15,7 +16,7 @@ class GabScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url)
@@ -28,15 +29,17 @@ class GabScraper(Scraper):
media_urls = []
archived_urls = {}
media_urls.extend([p['url'] for p in post['media_attachments']])
if media:
if post.get('repost') is not None:
media_urls.extend([p['url'] for p in post['repost']['media_attachments']])
media_urls.extend([p['url'] for p in post['media_attachments']])
for url in media_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
if post.get('repost') is not None:
media_urls.extend([p['url'] for p in post['repost']['media_attachments']])
for url in media_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -7,6 +7,7 @@ from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -18,7 +19,7 @@ class GettrScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
@@ -29,24 +30,26 @@ class GettrScraper(Scraper):
archived_urls = {}
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
if media:
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[img] = archived_url
archived_urls[post['main']] = archived_url
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
if 'vid' in post:
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
if 'vid' in post:
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -19,7 +19,7 @@ class OdyseeScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
@@ -31,17 +31,19 @@ class OdyseeScraper(Scraper):
break
archived_urls = {}
url = video.info['streaming_url']
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
if media:
url = video.info['streaming_url']
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
all_comments = video.get_all_comments()

View File

@@ -22,7 +22,7 @@ class RumbleScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
@@ -33,11 +33,13 @@ class RumbleScraper(Scraper):
archived_urls = {}
url = post['media_url']
if media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
url = post['media_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -14,7 +14,7 @@ class TelegramSnscrapeScraper(Scraper):
if channel.platform == "Telegram" and channel.public and not channel.chat:
return True
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
@@ -29,17 +29,19 @@ class TelegramSnscrapeScraper(Scraper):
archived_urls = {}
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if media:
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post.video] = archived_url
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post.video] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -0,0 +1,75 @@
from typing import Generator
from datetime import datetime, timezone
import os
import json
import tempfile
from pathlib import Path
from loguru import logger
from telethon.sync import TelegramClient
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
# Presumably the names of the Telegram media kinds this scraper cares about.
# NOTE(review): nothing in this module references this constant -- confirm
# whether it is still needed or is a placeholder for Content-Type handling.
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper):
    """An implementation of a Scraper for Telegram, using the Telethon library"""
    __version__ = "TelegramTelethonScraper 0.0.1"

    def get_username_from_url(self, url):
        """Return the channel username contained in a ``https://t.me/`` URL.

        Both ``https://t.me/<username>`` and the web-preview form
        ``https://t.me/s/<username>`` are accepted. A trailing ``/`` is
        dropped so it cannot leak into the username or the download key.

        Raises:
            IndexError: if ``url`` does not contain ``https://t.me/``.
        """
        username = url.split('https://t.me/')[1]
        if username.startswith('s/'):
            # Strip only the leading prefix. Splitting on 's/' would also cut
            # at any later "s/" (e.g. "s/news/" would yield "new").
            username = username[len('s/'):]
        return username.rstrip('/')

    def can_handle(self, channel):
        """Return True for public, non-chat channels whose platform is "Telegram"."""
        return bool(channel.platform == "Telegram" and channel.public and not channel.chat)

    def _download_media(self, client, post, key):
        """Download ``post.media`` into a temporary directory and read it back.

        Returns:
            ``(filename, blob)`` where ``filename`` is the name of the file
            found in the temporary directory (Telethon appends the extension
            itself) and ``blob`` is its content as bytes, or ``None`` when the
            download left no file behind.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            client.download_media(post.media, Path(temp_dir, key))

            downloaded = os.listdir(temp_dir)
            if not downloaded:
                # Nothing was written (no downloadable content for this
                # media); indexing [0] here would raise IndexError.
                return None

            filename = downloaded[0]
            return filename, Path(temp_dir, filename).read_bytes()

    def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield a ScraperResult for every message Telethon returns for ``channel``.

        Iteration stops at the first message whose date is not later than
        ``since.date`` (when ``since`` is given). When ``media`` is true, the
        media attached to each message is downloaded and passed to
        ``archive_media``; the result is stored in ``archived_urls`` under the
        post URL.

        Credentials are read from the ``TELEGRAM_API_ID_1``,
        ``TELEGRAM_API_HASH_1`` and ``TELEGRAM_PHONE_1`` environment variables.

        Raises:
            KeyError: if one of the environment variables is not set.
        """
        username = self.get_username_from_url(channel.url)

        api_id = os.environ['TELEGRAM_API_ID_1']
        api_hash = os.environ['TELEGRAM_API_HASH_1']
        phone = os.environ['TELEGRAM_PHONE_1']

        # Normalise the cut-off once instead of on every message.
        since_date = None if since is None else since.date.replace(tzinfo=timezone.utc)

        # The phone number doubles as the Telethon session name.
        with TelegramClient(phone, api_id, api_hash) as client:
            for post in client.iter_messages(username):
                post_date = post.date.replace(tzinfo=timezone.utc)

                if since_date is not None and post_date <= since_date:
                    # Report the archived cut-off, not the post's own date.
                    logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {since_date}')
                    break

                post_url = f'{channel.url}/{post.id}'
                key = f'{username}_{post.id}'

                archived_urls = {}

                if media and post.media is not None:
                    downloaded = self._download_media(client, post, key)
                    if downloaded is None:
                        logger.debug(f'No downloadable media for post: {post_url}')
                    else:
                        output_file_with_ext, blob = downloaded
                        # TODO specify Content-Type
                        archived_url = self.archive_media(blob = blob, content_type = '', key = output_file_with_ext)
                        archived_urls[post_url] = archived_url

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Telegram",
                    channel=channel.id,
                    platform_id=post_url,
                    date=post_date,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(post.to_dict(), default=str),
                    archived_urls=archived_urls)

View File

@@ -12,7 +12,7 @@ class TwitterScraper(Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
scraper = TwitterProfileScraper(channel.platform_id)
first = True

View File

@@ -10,4 +10,7 @@ addopts =
--cov='cisticola'
--cov-report html:reports/coverage
--html='reports/tests.html'
--self-contained-html
--self-contained-html
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute

View File

@@ -10,7 +10,7 @@ from cisticola.scraper import (
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add("../russian_telegram_ingest.log", level = "INFO")
logger.add("../russian_telegram_ingest.log")
test_channels = [
# Channel(

View File

@@ -9,6 +9,7 @@ from cisticola.scraper import (
OdyseeScraper,
RumbleScraper,
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
test_channels = [
@@ -117,11 +118,12 @@ scrapers = [
OdyseeScraper(),
RumbleScraper(),
TelegramSnscrapeScraper(),
TwitterScraper()]
TwitterScraper()
TelegramTelethonScraper()]
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)
controller.scrape_channels(test_channels, media = True)

View File

@@ -81,7 +81,7 @@ RUMBLE_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
TELEGRAM_SNSCRAPE_CHANNEL_KWARGS = {
TELEGRAM_CHANNEL_KWARGS = {
'id': 5,
'name': 'South West Ohio Proud Boys (test)',
'platform_id': -1001276612436,
@@ -141,7 +141,7 @@ def channel_kwargs():
'gettr' : GETTR_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram_snscrape' : TELEGRAM_SNSCRAPE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_bitchute_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(BitchuteScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import GabScraper
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_gab_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(GabScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_gettr_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(GettrScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_odysee_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(OdyseeScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_rumble_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(RumbleScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import TelegramSnscrapeScraper
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram_snscrape'])]
controller.register_scraper(TelegramSnscrapeScraper())
controller.scrape_channels(channels)
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, media = True)

View File

@@ -0,0 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
    """Scrape the 'telegram' test channel with the Telethon scraper, passing media=False."""
    target_channel = Channel(**channel_kwargs['telegram'])
    telethon_scraper = TelegramTelethonScraper()

    controller.register_scraper(scraper=telethon_scraper)
    controller.scrape_channels(channels=[target_channel], media=False)
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
    """Scrape the 'telegram' test channel with the Telethon scraper, passing media=True."""
    target_channel = Channel(**channel_kwargs['telegram'])
    telethon_scraper = TelegramTelethonScraper()

    controller.register_scraper(scraper=telethon_scraper)
    controller.scrape_channels(channels=[target_channel], media=True)

View File

@@ -1,8 +1,14 @@
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, media = False)
def test_scrape_twitter_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(TwitterScraper())
controller.scrape_channels(channels)
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, media = True)