Made tests work, fixed several issues with Rumble scraper

This commit is contained in:
Tristan Lee
2022-03-29 16:09:51 -05:00
parent 67d1abf024
commit b805d50132
12 changed files with 180 additions and 59 deletions

130
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "26955249044f1cd4bb4504c14f00f0c50508192338026227fc7b889e9f4fc11c"
"sha256": "3fb247a6b9b76ed811db7636b02ad848365d38dadb0da6a27c090e559e5540ec"
},
"pipfile-spec": 6,
"requires": {
@@ -34,19 +34,19 @@
},
"boto3": {
"hashes": [
"sha256:788aa3281e91413bc201268a251c9d4ca2e9deb3a4af74daea2389cf66e5132e",
"sha256:ca37b9b4ade72f6d4fa2b7bee584dd5b1c7585f07f22ff1edbc9ecc0c4173b1f"
"sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1",
"sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249"
],
"index": "pypi",
"version": "==1.21.28"
"version": "==1.21.29"
},
"botocore": {
"hashes": [
"sha256:03c41d26d1e765380b8175d4b136d3144aa051f17a86eebfdf9a885a5a9a6a72",
"sha256:102eb24b44d473adea6bb8728b20fb9547fa5858c3293df7cad67ef17ea736a7"
"sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8",
"sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.28"
"version": "==1.24.29"
},
"brotli": {
"hashes": [
@@ -138,6 +138,61 @@
],
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
],
"version": "==1.15.0"
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
@@ -154,6 +209,46 @@
"markers": "python_version >= '3.7'",
"version": "==8.1.0"
},
"cryptg": {
"hashes": [
"sha256:02b31622a75a49a5dcd25e589c85faae54575f018e055bd21a17df97c8bb9095",
"sha256:0da1b367056e57a5c01d22608da0cd50e597b917c1b2d9631767aa3c0640a99a",
"sha256:135688c6fbda90748924c2cb047f63785ebf4397d81acc4a05357950653c5096",
"sha256:1fb6c6d4561a54406593197c1f5f23662ab320f4af4ab11834e1583e9d27a49a",
"sha256:2516557e89803637fa7342de43dbcc5f84bf68ae05b1064a354a62d423447d9f",
"sha256:29001dafd3d6a054365222b1f89b12876723c89cdd10aa0e5885a05dfd034eeb",
"sha256:2cc8115960e49a038091ffb2d09de59e0acbdc76de10d7d415b7671a06bae0a9",
"sha256:2cd8224eb64af756f45cdceab16d048494313db8acec1e38d75d97716082267b",
"sha256:307bf96a6ac9c87b44531d8da5fe3a6c5d856e1dc69b68136ef9c4fb66ad17ac",
"sha256:31cf7682de69022c9a77739cdcf7116b06522b128b9b51c7593f277f38c38dbf",
"sha256:3bc2f372dec3a7753c0c0d72c69fcbe44af5473f870a3406978e07e8560a1aa6",
"sha256:46960979542155c9d903656a3a39770061b09a3691a23296f06dc168fe4ff962",
"sha256:47ad5916be4558f4d674c12800e8d9663ce938b0046f19cdc869ba3a7ca280ec",
"sha256:5faed49d972c7f44ce4d6fa1a64169c85a11209fa1fbe1c8a333fb1454888725",
"sha256:695636cca0ee938bd7113658ee60bfaf89afa19708c40ecae5f4a222c2ec544a",
"sha256:6c5d66975fc59adca203fa91e2a104240457114468162d30e9213661239ac1d6",
"sha256:72a5485ece10a70160170ceb658b1836db82dccab08a1f7029c54d81cf6b1d43",
"sha256:7fc8e1893775c6f53dceda1959f19833cc27a67a80492c10e2415dc601b36650",
"sha256:890584db41c8e1e046ae40dee0074614470d36ebd6b7e57bb91303300066601f",
"sha256:a1fb178702730b59267f1e6c6dfe16c7bb9c1350cee4183221982ad2dba4e7f5",
"sha256:a4de1730ca56aa8a945f176c25586901ed5e9f15ffb70c6459eedf466eb6299b",
"sha256:b6352555e47f389ed502269bdb537233d0a928b12d9f4caa57e8c707151acd30",
"sha256:b8896394b72ff7dbf38072ad4c2cd59abdd9e388bb55e1c369102beb8e569f9d",
"sha256:bbd05b52d09e78bdc595f229c0481f4f2e1daf3959847322a6b2c1f76119305f",
"sha256:bf00943924cddb0838f8a65f5aae31f6fe2ad64a5d7e6f10a6b900b3f01b0ae0",
"sha256:bf15aae0fa01aeec728ab16b920cf4c6b2793099c71f62f30ff100d6fe8c9859",
"sha256:c09a5b14494532fc3226f5c5f57ef2a651c935ed6a1d2d0f9eff110046725524",
"sha256:c4812802ce4cd6f08189ce0fa8b79e9a96ac941e69e6b3032bb6908baefde2ba",
"sha256:c69c1e19884108e508697919de0cd43e2ca4e9af418962aa235273b3c51a0e37",
"sha256:ce08c04ebb06ce1ac417597c1bb514a3c1b36cf5c286b8c60f23df2e65703bf3",
"sha256:e29b0d944176cf88fe52d1c58f46017b5bddc9cc54ec0fc6fac20043febefc32",
"sha256:e48ab84e0ed364436d5e449c59762c5963f08ad87f6508f4cb7644745b5559a8",
"sha256:eff15f0a1eee678dd9ec747b58ce86edb78b608036ac4e02d8349f5f35202495",
"sha256:fdd62c2be23eeabb9ebd2ad41bf153f5ec48b968885ef14e676515407cd56339"
],
"index": "pypi",
"version": "==0.2.post4"
},
"dateparser": {
"hashes": [
"sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9",
@@ -394,7 +489,7 @@
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version >= '3.5' and python_version < '4.0'",
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"numpy": {
@@ -517,7 +612,7 @@
},
"polyphemus": {
"git": "https://github.com/bellingcat/polyphemus.git",
"ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed"
"ref": "00a5123a3768a55ffe29f2c803a4181895f17890"
},
"py": {
"hashes": [
@@ -569,6 +664,13 @@
],
"version": "==0.2.8"
},
"pycparser": {
"hashes": [
"sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9",
"sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"
],
"version": "==2.21"
},
"pycryptodomex": {
"hashes": [
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
@@ -742,7 +844,6 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -763,7 +864,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4.0'",
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8"
},
"s3transfer": {
@@ -784,7 +885,7 @@
},
"snscrape": {
"git": "https://github.com/bellingcat/snscrape.git",
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
"ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21"
},
"soupsieve": {
"hashes": [
@@ -872,7 +973,7 @@
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"websockets": {
@@ -1259,7 +1360,6 @@
"version": "==2022.1"
},
"requests": {
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1359,7 +1459,7 @@
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"zipp": {

View File

@@ -1,5 +1,5 @@
from cisticola.utils import make_request
from .base import Scraper, ScraperController
from .base import Scraper, ScraperController, ChannelDoesNotExistError
from .bitchute import BitchuteScraper
from .gab import GabScraper
from .gettr import GettrScraper

View File

@@ -412,4 +412,7 @@ class ScraperController:
"""
mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine)
self.connect_to_db(self.engine)
class ChannelDoesNotExistError(Exception):
    """Signal that a requested channel could not be found.

    Used when the specified channel does not exist or has been deleted
    on the platform being scraped.
    """

View File

@@ -58,7 +58,8 @@ class BitchuteScraper(Scraper):
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
@@ -88,14 +89,19 @@ class BitchuteScraper(Scraper):
response = session.post(canonical_url + 'counts/', data = data, headers = headers)
counts = json.loads(response.text)
owner_soup = soup.find('p', {'class' : 'owner'})
if owner_soup.text == '[email\xa0protected]':
owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail'])
else:
owner_name = owner_soup.text
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']),
'category' : info_list[-1].text.split('Category')[1].strip(),
'owner_name' : owner_name,
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}

View File

@@ -49,7 +49,8 @@ class GabScraper(Scraper):
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel: Channel) -> bool:
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:

View File

@@ -59,7 +59,8 @@ class GettrScraper(Scraper):
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:

View File

@@ -80,7 +80,8 @@ class InstagramScraper(Scraper):
date=post.date_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
for comment in post.get_comments():
@@ -96,7 +97,8 @@ class InstagramScraper(Scraper):
date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str),
archived_urls={})
archived_urls={},
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:

View File

@@ -3,9 +3,11 @@ import json
from typing import Generator
from urllib.parse import urlparse
from polyphemus.base import OdyseeChannel
import requests
from loguru import logger
from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
@@ -13,6 +15,10 @@ class OdyseeScraper(Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
__version__ = "OdyseeScraper 0.0.1"
def __init__(self):
    """Set up the base scraper, then obtain and store an auth token.

    The token returned by ``get_auth_token()`` is kept on the instance
    as ``self.auth_token``.
    """
    super().__init__()
    token = get_auth_token()
    self.auth_token = token
def get_username_from_url(self, url):
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
@@ -22,12 +28,12 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
all_videos = odysee_channel.get_all_videos()
for video in all_videos:
if since is not None and datetime.fromtimestamp(video['created']) <= since.date:
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
break
archived_urls = {}
@@ -55,7 +61,8 @@ class OdyseeScraper(Scraper):
date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
for comment in all_comments:
@@ -67,7 +74,8 @@ class OdyseeScraper(Scraper):
date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
archived_urls={})
archived_urls={},
media_archived=True)
def can_handle(self, channel):
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
@@ -82,7 +90,7 @@ class OdyseeScraper(Scraper):
def get_profile(self, channel: Channel) -> dict:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
profile = odysee_channel.info
return profile

View File

@@ -14,18 +14,12 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
def get_username_from_url(self, url):
username = url.split('https://rumble.com/c/')[1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
scraper = get_channel_videos(channel.url)
for post in scraper:
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
break
archived_urls = {}
@@ -43,10 +37,11 @@ class RumbleScraper(Scraper):
platform="Rumble",
channel=channel.id,
platform_id=post['media_url'].split('/')[-2],
date=datetime.fromisoformat(post['datetime']).replace(tzinfo=timezone.utc),
date=post['datetime'].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)
raw_data=json.dumps(post, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]
@@ -54,13 +49,12 @@ class RumbleScraper(Scraper):
return key
def can_handle(self, channel):
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
if channel.platform == "Rumble" and channel.url is not None:
return True
def get_profile(self, channel: Channel) -> dict:
username = self.get_username_from_url(channel.url)
profile = get_channel_profile(username = username)
profile = get_channel_profile(url = channel.url)
return profile
@@ -69,7 +63,7 @@ class RumbleScraper(Scraper):
def get_media_url(url):
r = make_request(url = url)
soup = BeautifulSoup(r.content, features = 'lxml')
soup = BeautifulSoup(r.content, features = 'html.parser')
script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
media_url = script[0]['embedUrl']
@@ -91,16 +85,16 @@ def process_video(video):
'views' : video.find('span', {'class' : 'video-item--views'})['data-value'],
'rumbles' : rumbles,
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
'datetime' : video.find('time')['datetime']}
'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
info['media_url'] = get_media_url(info['link'])
return info
def get_channel_videos(username):
def get_channel_videos(url):
page = 1
channel_url = f'{BASE_URL}/c/{username}?page='
channel_url = f'{url}?page='
while True:
url = channel_url + str(page)
@@ -118,9 +112,9 @@ def get_channel_videos(username):
page += 1
def get_channel_profile(username):
def get_channel_profile(url):
channel_url = f'{BASE_URL}/c/{username}'
channel_url = f'{url}'
r = make_request(url = channel_url)
soup = BeautifulSoup(r.content, features = 'lxml')
@@ -133,7 +127,7 @@ def get_channel_profile(username):
'verified': verified_svg is not None,
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)}
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -6,7 +6,7 @@ from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper,
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
@@ -67,7 +67,8 @@ class TwitterScraper(Scraper):
date=tweet.date,
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Twitter" and channel.platform_id:
@@ -92,7 +93,10 @@ class TwitterScraper(Scraper):
def get_profile(self, channel: Channel) -> dict:
scraper = TwitterUserScraper(channel.platform_id)
scraper = TwitterUserScraper(channel.screenname)
entity = scraper._get_entity()
profile = scraper._get_entity().__dict__
return profile
if entity is None:
raise ChannelDoesNotExistError(channel.url)
else:
return entity.__dict__

View File

@@ -25,7 +25,7 @@ class VkontakteScraper(Scraper):
first = True
for post in scraper.get_items():
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
# with VKontakteUserScraper, the first tweet could be an old pinned tweet
if first:
first = False
@@ -63,7 +63,8 @@ class VkontakteScraper(Scraper):
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Vkontakte" and channel.platform_id:

View File

@@ -72,7 +72,8 @@ class YoutubeScraper(Scraper):
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Youtube" and channel.url: