Use new RawChannelInfo class

2026-06-27 04:28:42 +03:00 · 2022-03-31 15:17:25 +02:00
parent 61c99d33f6
commit 2dc9213d64
17 changed files with 1698 additions and 79 deletions
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -9,7 +9,7 @@ from typing import Generator
 import requests
 from bs4 import BeautifulSoup

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
                platform_id=post['id'],
                date=datetime.fromtimestamp(post['timestamp']),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
        if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
            return True

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        base_url = channel.url
        
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
            'subscribers': counts['subscriber_count'],
            'views': int(counts['about_view_count'].split(' ')[0])}
        
-        return profile
-
+        
+        return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 def strip_tags(html, convert_newlines=True):
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -5,7 +5,7 @@ import os

 from gabber.client import Client, GAB_API_BASE_URL

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
                platform_id=post['id'],
                date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -88,7 +88,7 @@ class GabScraper(Scraper):
        if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
            return True

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        client = Client(
            username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):

            profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -5,7 +5,7 @@ from urllib.parse import urlparse

 from gogettr import PublicClient

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
                platform_id=post['_id'],
                date=datetime.fromtimestamp(post['cdate']/1000.),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
        key = urlparse(url).path.split('/')[-2] + ext
        return key 

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        client = client = PublicClient()
        username = self.get_username_from_url(channel.url)
        profile = client.user_info(username)

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/instagram.py
+++ b/cisticola/scraper/instagram.py
@@ -8,7 +8,7 @@ from pathlib import Path
 from loguru import logger
 import instaloader 

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
                platform_id=post.mediaid,
                date=post.date_utc,
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post._asdict(), default=str),
+                raw_posts=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
                    platform_id=post.mediaid,
                    date=comment.created_at_utc,
                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(comment_dict, default=str),
+                    raw_posts=json.dumps(comment_dict, default=str),
                    archived_urls={},
                    media_archived=archive_media)

@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
        if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
            return True

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        username = self.get_username_from_url(channel.url)

@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
        profile['followers'] = user_profile.followers
        profile['followees'] = user_profile.followees

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+                        platform=channel.platform,
+                        channel=channel.id,
+                        raw_data=json.dumps(profile),
+                        date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -8,7 +8,7 @@ from loguru import logger

 from polyphemus.base import OdyseeChannel
 from polyphemus.api import get_auth_token
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
                platform_id=video.info['claim_id'],
                date=datetime.fromtimestamp(video.info['created']),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(video.info),
+                raw_posts=json.dumps(video.info),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
                    platform_id=comment.info['claim_id'],
                    date=datetime.fromtimestamp(comment.info['created']),
                    date_archived=datetime.now(),
-                    raw_data=json.dumps(comment.info),
+                    raw_posts=json.dumps(comment.info),
                    archived_urls={},
                    media_archived=True)

@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):

        return f'{key}.{ext}'

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        username = self.get_username_from_url(channel.url)
        odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
        profile = odysee_channel.info

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+                        platform=channel.platform,
+                        channel=channel.id,
+                        raw_data=json.dumps(profile),
+                        date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -5,7 +5,7 @@ from urllib.parse import urlparse

 from bs4 import BeautifulSoup

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper import Scraper, make_request

 BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
                platform_id=post['media_url'].split('/')[-2],
                date=post['datetime'].replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post, default = str),
+                raw_posts=json.dumps(post, default = str),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
        if channel.platform == "Rumble" and channel.url is not None:
            return True

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        profile = get_channel_profile(url = channel.url)

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+                        platform=channel.platform,
+                        channel=channel.id,
+                        raw_data=json.dumps(profile),
+                        date_archived=datetime.now(timezone.utc))

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

@@ -128,6 +132,7 @@ def get_channel_profile(url):
        'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
        'cover':  cover_soup.get('src') if cover_soup else None,
        'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
+        
    return profile

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -1,10 +1,10 @@
 from typing import Generator
 from datetime import datetime, timezone
-
+import json
 import snscrape.modules
 from loguru import logger

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
                platform_id=post.url,
                date=post.date,
                date_archived=datetime.now(timezone.utc),
-                raw_data=post.json(),
+                raw_posts=post.json(),
                archived_urls=archived_urls,
                media_archived=archive_media
            )

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        profile = scr._get_entity().__dict__
-        return profile
+        
+        return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
 from telethon.tl.functions.channels import GetFullChannelRequest
 from telethon.tl import types

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
            key = list(result.archived_urls.keys())[0]

            if result.archived_urls[key] is None:
-                raw = json.loads(result.raw_data)
+                raw = json.loads(result.raw_posts)
                    
                message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])

@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
                    platform_id=post_url,
                    date=post.date.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(post.to_dict(), default=str),
+                    raw_posts=json.dumps(post.to_dict(), default=str),
                    archived_urls=archived_urls,
                    media_archived=archive_media)

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        username = self.get_username_from_url(channel.url)

@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
            full_channel = client(GetFullChannelRequest(channel = username))
        profile = full_channel.__dict__

-        return profile
+        return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile, default=str),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -1,11 +1,11 @@
 from datetime import datetime, timezone
 from typing import Generator
 from urllib.parse import urlparse, parse_qs
-
 from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
 from loguru import logger
+import json

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper, ChannelDoesNotExistError

 class TwitterScraper(Scraper):
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
                platform_id=tweet.id,
                date=tweet.date,
                date_archived=datetime.now(timezone.utc),
-                raw_data=tweet.json(),
+                raw_posts=tweet.json(),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
        key = parsed_url.path.split('/')[-1] + ext
        return key 

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        scraper = TwitterUserScraper(channel.screenname)
        entity = scraper._get_entity()
@@ -99,4 +99,8 @@ class TwitterScraper(Scraper):
        if entity is None:
            raise ChannelDoesNotExistError(channel.url)
        else:   
-            return entity.__dict__
+            return RawChannelInfo(scraper=self.__version__,
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(entity.__dict__, default=str),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -1,11 +1,10 @@
 from datetime import datetime, timezone
 from typing import Generator
 from urllib.parse import urlparse
-
 from snscrape.modules.vkontakte import VKontakteUserScraper
 from loguru import logger

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

 class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
                platform_id=post.url.split('/')[-1],
                date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=post.json(),
+                raw_posts=post.json(),
                archived_urls=archived_urls,
                media_archived=archive_media)

@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
            
        return key

-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:

        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)
        
        profile = scraper._get_entity().__dict__
-        return profile
+        import json  # local import: this module has no top-level `import json`
+        return RawChannelInfo(scraper=self.__version__,
+                    platform=channel.platform,
+                    channel=channel.id,
+                    raw_data=json.dumps(profile),
+                    date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/youtube.py
+++ b/cisticola/scraper/youtube.py
@@ -2,10 +2,9 @@ from datetime import datetime, timezone
 import json
 from typing import Generator
 import tempfile
-
 import yt_dlp

-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper import Scraper

 class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
                        platform_id=video_id,
                        date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
                        date_archived=datetime.now(timezone.utc),
-                        raw_data=json.dumps(video, default = str),
+                        raw_posts=json.dumps(video, default = str),
                        archived_urls=archived_urls,
                        media_archived=archive_media)
                        
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
        if channel.platform == "Youtube" and channel.url:
            return True

-    def get_profile(self, channel: Channel) -> dict:
-
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        ydl_opts = {}
        ydl = yt_dlp.YoutubeDL(ydl_opts)

@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
            meta = ydl.extract_info(
                channel.url,
                process=False)
+
+            return RawChannelInfo(scraper=self.__version__,
+                    platform=channel.platform,
+                    channel=channel.id,
+                    raw_data=json.dumps(meta, default = str),
+                    date_archived=datetime.now(timezone.utc))
+
        except yt_dlp.utils.DownloadError as e:
            raise e
-
-        return meta