diff --git a/Pipfile.lock b/Pipfile.lock index 79b1b1c..d78928c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "26955249044f1cd4bb4504c14f00f0c50508192338026227fc7b889e9f4fc11c" + "sha256": "3fb247a6b9b76ed811db7636b02ad848365d38dadb0da6a27c090e559e5540ec" }, "pipfile-spec": 6, "requires": { @@ -34,19 +34,19 @@ }, "boto3": { "hashes": [ - "sha256:788aa3281e91413bc201268a251c9d4ca2e9deb3a4af74daea2389cf66e5132e", - "sha256:ca37b9b4ade72f6d4fa2b7bee584dd5b1c7585f07f22ff1edbc9ecc0c4173b1f" + "sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1", + "sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249" ], "index": "pypi", - "version": "==1.21.28" + "version": "==1.21.29" }, "botocore": { "hashes": [ - "sha256:03c41d26d1e765380b8175d4b136d3144aa051f17a86eebfdf9a885a5a9a6a72", - "sha256:102eb24b44d473adea6bb8728b20fb9547fa5858c3293df7cad67ef17ea736a7" + "sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8", + "sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c" ], "markers": "python_version >= '3.6'", - "version": "==1.24.28" + "version": "==1.24.29" }, "brotli": { "hashes": [ @@ -138,6 +138,61 @@ ], "version": "==2021.10.8" }, + "cffi": { + "hashes": [ + "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3", + "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2", + "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636", + "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20", + "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728", + "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27", + "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66", + "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443", + "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0", + "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7", + "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39", + "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605", + "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a", + "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37", + "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029", + "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139", + "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc", + "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df", + "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14", + "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880", + "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2", + "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a", + "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e", + "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474", + "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024", + "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8", + "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0", + "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e", + "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a", + "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e", + "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032", + "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6", + "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e", + "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b", + "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e", + "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954", + "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962", + "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c", + "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4", + "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55", + "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962", + "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023", + "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c", + "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6", + "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8", + "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382", + "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7", + "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc", + "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997", + "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796" + ], + "version": "==1.15.0" + }, "charset-normalizer": { "hashes": [ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", @@ -154,6 +209,46 @@ "markers": "python_version >= '3.7'", "version": "==8.1.0" }, + "cryptg": { + "hashes": [ + "sha256:02b31622a75a49a5dcd25e589c85faae54575f018e055bd21a17df97c8bb9095", + "sha256:0da1b367056e57a5c01d22608da0cd50e597b917c1b2d9631767aa3c0640a99a", + "sha256:135688c6fbda90748924c2cb047f63785ebf4397d81acc4a05357950653c5096", + "sha256:1fb6c6d4561a54406593197c1f5f23662ab320f4af4ab11834e1583e9d27a49a", + "sha256:2516557e89803637fa7342de43dbcc5f84bf68ae05b1064a354a62d423447d9f", + "sha256:29001dafd3d6a054365222b1f89b12876723c89cdd10aa0e5885a05dfd034eeb", + "sha256:2cc8115960e49a038091ffb2d09de59e0acbdc76de10d7d415b7671a06bae0a9", + "sha256:2cd8224eb64af756f45cdceab16d048494313db8acec1e38d75d97716082267b", + "sha256:307bf96a6ac9c87b44531d8da5fe3a6c5d856e1dc69b68136ef9c4fb66ad17ac", + "sha256:31cf7682de69022c9a77739cdcf7116b06522b128b9b51c7593f277f38c38dbf", + "sha256:3bc2f372dec3a7753c0c0d72c69fcbe44af5473f870a3406978e07e8560a1aa6", + "sha256:46960979542155c9d903656a3a39770061b09a3691a23296f06dc168fe4ff962", + "sha256:47ad5916be4558f4d674c12800e8d9663ce938b0046f19cdc869ba3a7ca280ec", + "sha256:5faed49d972c7f44ce4d6fa1a64169c85a11209fa1fbe1c8a333fb1454888725", + "sha256:695636cca0ee938bd7113658ee60bfaf89afa19708c40ecae5f4a222c2ec544a", + "sha256:6c5d66975fc59adca203fa91e2a104240457114468162d30e9213661239ac1d6", + "sha256:72a5485ece10a70160170ceb658b1836db82dccab08a1f7029c54d81cf6b1d43", + "sha256:7fc8e1893775c6f53dceda1959f19833cc27a67a80492c10e2415dc601b36650", + "sha256:890584db41c8e1e046ae40dee0074614470d36ebd6b7e57bb91303300066601f", + "sha256:a1fb178702730b59267f1e6c6dfe16c7bb9c1350cee4183221982ad2dba4e7f5", + "sha256:a4de1730ca56aa8a945f176c25586901ed5e9f15ffb70c6459eedf466eb6299b", + "sha256:b6352555e47f389ed502269bdb537233d0a928b12d9f4caa57e8c707151acd30", + "sha256:b8896394b72ff7dbf38072ad4c2cd59abdd9e388bb55e1c369102beb8e569f9d", + "sha256:bbd05b52d09e78bdc595f229c0481f4f2e1daf3959847322a6b2c1f76119305f", + "sha256:bf00943924cddb0838f8a65f5aae31f6fe2ad64a5d7e6f10a6b900b3f01b0ae0", + "sha256:bf15aae0fa01aeec728ab16b920cf4c6b2793099c71f62f30ff100d6fe8c9859", + "sha256:c09a5b14494532fc3226f5c5f57ef2a651c935ed6a1d2d0f9eff110046725524", + "sha256:c4812802ce4cd6f08189ce0fa8b79e9a96ac941e69e6b3032bb6908baefde2ba", + "sha256:c69c1e19884108e508697919de0cd43e2ca4e9af418962aa235273b3c51a0e37", + "sha256:ce08c04ebb06ce1ac417597c1bb514a3c1b36cf5c286b8c60f23df2e65703bf3", + "sha256:e29b0d944176cf88fe52d1c58f46017b5bddc9cc54ec0fc6fac20043febefc32", + "sha256:e48ab84e0ed364436d5e449c59762c5963f08ad87f6508f4cb7644745b5559a8", + "sha256:eff15f0a1eee678dd9ec747b58ce86edb78b608036ac4e02d8349f5f35202495", + "sha256:fdd62c2be23eeabb9ebd2ad41bf153f5ec48b968885ef14e676515407cd56339" + ], + "index": "pypi", + "version": "==0.2.post4" + }, "dateparser": { "hashes": [ "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", @@ -394,7 +489,7 @@ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], - "markers": "python_version >= '3.5' and python_version < '4.0'", + "markers": "python_version >= '3.5' and python_version < '4'", "version": "==1.45.1" }, "numpy": { @@ -517,7 +612,7 @@ }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", - "ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed" + "ref": "00a5123a3768a55ffe29f2c803a4181895f17890" }, "py": { "hashes": [ @@ -569,6 +664,13 @@ ], "version": "==0.2.8" }, + "pycparser": { + "hashes": [ + "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9", + "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206" + ], + "version": "==2.21" + }, "pycryptodomex": { "hashes": [ "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", @@ -742,7 +844,6 @@ "version": "==2022.3.2" }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -763,7 +864,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -784,7 +885,7 @@ }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", - "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" + "ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21" }, "soupsieve": { "hashes": [ @@ -872,7 +973,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "websockets": { @@ -1259,7 +1360,6 @@ "version": "==2022.1" }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1359,7 +1459,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "zipp": { diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 36e6cd5..8796633 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,5 +1,5 @@ from cisticola.utils import make_request -from .base import Scraper, ScraperController +from .base import Scraper, ScraperController, ChannelDoesNotExistError from .bitchute import BitchuteScraper from .gab import GabScraper from .gettr import GettrScraper diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index c887ee1..023fa3c 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -412,4 +412,7 @@ class ScraperController: """ mapper_registry.metadata.drop_all(bind=self.engine) - self.connect_to_db(self.engine) \ No newline at end of file + self.connect_to_db(self.engine) + +class ChannelDoesNotExistError(Exception): + """The specified channel does not exist or has been deleted.""" \ No newline at end of file diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index d8d3f0b..e9a9770 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -58,7 +58,8 @@ class BitchuteScraper(Scraper): date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: @@ -88,14 +89,19 @@ class BitchuteScraper(Scraper): response = session.post(canonical_url + 'counts/', data = data, headers = headers) counts = json.loads(response.text) + owner_soup = soup.find('p', {'class' : 'owner'}) + if owner_soup.text == '[email\xa0protected]': + owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail']) + else: + owner_name = owner_soup.text + profile = { 'description' : description_soup.text.strip(), 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], - 'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']), - 'category' : info_list[-1].text.split('Category')[1].strip(), + 'owner_name' : owner_name, 'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'), 'subscribers': counts['subscriber_count'], 'views': int(counts['about_view_count'].split(' ')[0])} diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index aa698ad..126f500 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -49,7 +49,8 @@ class GabScraper(Scraper): date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel: Channel) -> bool: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 5e8d7ac..c8e63f9 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -59,7 +59,8 @@ class GettrScraper(Scraper): date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index a5613a4..f045011 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -80,7 +80,8 @@ class InstagramScraper(Scraper): date=post.date_utc, date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post._asdict(), default=str), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) for comment in post.get_comments(): @@ -96,7 +97,8 @@ class InstagramScraper(Scraper): date=comment.created_at_utc, date_archived=datetime.now(timezone.utc), raw_data=json.dumps(comment_dict, default=str), - archived_urls={}) + archived_urls={}, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 980653c..4ff80e0 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -3,9 +3,11 @@ import json from typing import Generator from urllib.parse import urlparse -from polyphemus.base import OdyseeChannel import requests +from loguru import logger +from polyphemus.base import OdyseeChannel +from polyphemus.api import get_auth_token from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper @@ -13,6 +15,10 @@ class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" + def __init__(self): + super().__init__() + self.auth_token = get_auth_token() + def get_username_from_url(self, url): username = url.split('odysee.com/')[-1].strip('@').split(':')[0] @@ -22,12 +28,12 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username) + odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) all_videos = odysee_channel.get_all_videos() for video in all_videos: - if since is not None and datetime.fromtimestamp(video['created']) <= since.date: + if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date: break archived_urls = {} @@ -55,7 +61,8 @@ class OdyseeScraper(Scraper): date=datetime.fromtimestamp(video.info['created']), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video.info), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) for comment in all_comments: @@ -67,7 +74,8 @@ class OdyseeScraper(Scraper): date=datetime.fromtimestamp(comment.info['created']), date_archived=datetime.now(), raw_data=json.dumps(comment.info), - archived_urls={}) + archived_urls={}, + media_archived=True) def can_handle(self, channel): if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: @@ -82,7 +90,7 @@ class OdyseeScraper(Scraper): def get_profile(self, channel: Channel) -> dict: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username) + odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) profile = odysee_channel.info return profile \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 32e40e8..39a29ba 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,18 +14,12 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" - def get_username_from_url(self, url): - username = url.split('https://rumble.com/c/')[1] - - return username - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = self.get_username_from_url(channel.url) - scraper = get_channel_videos(username) + scraper = get_channel_videos(channel.url) for post in scraper: - if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: + if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): break archived_urls = {} @@ -43,10 +37,11 @@ class RumbleScraper(Scraper): platform="Rumble", channel=channel.id, platform_id=post['media_url'].split('/')[-2], - date=datetime.fromisoformat(post['datetime']).replace(tzinfo=timezone.utc), + date=post['datetime'].replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), - archived_urls=archived_urls) + raw_data=json.dumps(post, default = str), + archived_urls=archived_urls, + media_archived=archive_media) def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] @@ -54,13 +49,12 @@ class RumbleScraper(Scraper): return key def can_handle(self, channel): - if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: + if channel.platform == "Rumble" and channel.url is not None: return True def get_profile(self, channel: Channel) -> dict: - username = self.get_username_from_url(channel.url) - profile = get_channel_profile(username = username) + profile = get_channel_profile(url = channel.url) return profile @@ -69,7 +63,7 @@ class RumbleScraper(Scraper): def get_media_url(url): r = make_request(url = url) - soup = BeautifulSoup(r.content, features = 'lxml') + soup = BeautifulSoup(r.content, features = 'html.parser') script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text)) media_url = script[0]['embedUrl'] @@ -91,16 +85,16 @@ def process_video(video): 'views' : video.find('span', {'class' : 'video-item--views'})['data-value'], 'rumbles' : rumbles, 'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'], - 'datetime' : video.find('time')['datetime']} + 'datetime' : datetime.fromisoformat(video.find('time')['datetime'])} info['media_url'] = get_media_url(info['link']) return info -def get_channel_videos(username): +def get_channel_videos(url): page = 1 - channel_url = f'{BASE_URL}/c/{username}?page=' + channel_url = f'{url}?page=' while True: url = channel_url + str(page) @@ -118,9 +112,9 @@ def get_channel_videos(username): page += 1 -def get_channel_profile(username): +def get_channel_profile(url): - channel_url = f'{BASE_URL}/c/{username}' + channel_url = f'{url}' r = make_request(url = channel_url) soup = BeautifulSoup(r.content, features = 'lxml') @@ -133,7 +127,7 @@ def get_channel_profile(username): 'verified': verified_svg is not None, 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, 'cover': cover_soup.get('src') if cover_soup else None, - 'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)} + 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} return profile #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 126d75d..cc0afb9 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -6,7 +6,7 @@ from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, from loguru import logger from cisticola.base import Channel, ScraperResult -from cisticola.scraper.base import Scraper +from cisticola.scraper.base import Scraper, ChannelDoesNotExistError class TwitterScraper(Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" @@ -67,7 +67,8 @@ class TwitterScraper(Scraper): date=tweet.date, date_archived=datetime.now(timezone.utc), raw_data=tweet.json(), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Twitter" and channel.platform_id: @@ -92,7 +93,10 @@ class TwitterScraper(Scraper): def get_profile(self, channel: Channel) -> dict: - scraper = TwitterUserScraper(channel.platform_id) + scraper = TwitterUserScraper(channel.screenname) + entity = scraper._get_entity() - profile = scraper._get_entity().__dict__ - return profile \ No newline at end of file + if entity is None: + raise ChannelDoesNotExistError(channel.url) + else: + return entity.__dict__ \ No newline at end of file diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 78dfe98..97724c6 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -25,7 +25,7 @@ class VkontakteScraper(Scraper): first = True for post in scraper.get_items(): - if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): # with VKontakteUserScraper, the first tweet could be an old pinned tweet if first: first = False @@ -63,7 +63,8 @@ class VkontakteScraper(Scraper): date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=post.json(), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Vkontakte" and channel.platform_id: diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 88c75a4..1e2346b 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -72,7 +72,8 @@ class YoutubeScraper(Scraper): date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video, default = str), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Youtube" and channel.url: