Got transformers for Bitchute, Rumble, and Gettr working for all raw_posts.

2026-06-08 03:18:34 +03:00 · 2022-06-20 21:45:41 -05:00
parent a2a7882f1c
commit 619fe42a31
6 changed files with 114 additions and 14 deletions
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -229,12 +229,27 @@ class Post:
    #: The ID of the Channel that the post was forwarded or quoted from
    forwarded_from: int = None
      
-    #: The ID of the Post that this Post is a reply to or reblog of
+    #: The ID of the Post that this Post is a reply to
    reply_to: int = None

    #: Other users mentioned in the post
    mentions: list = field(default_factory=list)

+    #: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes)
+    likes: int = None
+
+    #: Number of times the post was forwarded/retweeted/shared
+    forwards: int = None
+
+    #: Number of times the post was viewed
+    views: int = None
+
+    #: Video title, if post is a video
+    video_title: str = None
+
+    #: Video duration in seconds, if post is a video
+    video_duration: int = None
+
    def hydrate(self):
        URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|z
m|zw)\b/?(?!@)))"""

@@ -246,6 +261,7 @@ class Post:
        
        hashtags = re.findall(HASHTAG_REGEX, self.content)
        self.hashtags += hashtags
+        self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))

        # regex patterns for finding crypto addresses
        BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
@@ -436,7 +452,7 @@ post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
-                       Column('platform_id', Integer, index=True),
+                       Column('platform_id', String, index=True),
                       Column('scraper', String),
                       Column('transformer', String),
                       Column('platform', String),
@@ -455,6 +471,11 @@ post_table = Table('posts', mapper_registry.metadata,
                       Column('hashtags', JSON),
                       Column('outlinks', JSON),
                       Column('mentions', JSON),
+                       Column('likes', Integer),
+                       Column('forwards', Integer),
+                       Column('views', Integer),
+                       Column('video_title', String),
+                       Column('video_duration', Integer),
                       Column('detected_language', String),
                       Column('normalized_content', String)
                       )
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -171,8 +171,8 @@ class ETLController:
                        session.commit()
                        break

-                    if handled == False:
-                        logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
+                if handled == False:
+                    logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")

    @logger.catch(reraise=True)
    def transform_all_untransformed(self, hydrate: bool = True):
--- a/cisticola/transformer/bitchute.py
+++ b/cisticola/transformer/bitchute.py
@@ -59,8 +59,29 @@ class BitchuteTransformer(Transformer):
    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

-        soup = BeautifulSoup(raw['body'], features = 'html.parser')
-        content = soup.find_all('p')[-1].text
+        if raw['category'] == 'comment':
+            if raw['parent_id'] is None:
+                reply_to_id = raw['thread_id']
+            else:
+                reply_to_id = raw['parent_id']
+            post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
+            if post is None:
+                if raw['parent_id'] is not None:
+                    # parent_id matched no stored post (e.g. the parent comment was deleted), so fall back to the thread's post
+                    post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
+                    reply_to = post.id
+                else:
+                    reply_to = -1
+            else:
+                reply_to = post.id
+            content = raw['body'].strip()
+        else:
+            reply_to = -1
+            soup = BeautifulSoup(raw['body'], features = 'html.parser')
+            soup.find('div', {'class': 'teaser'}).decompose()
+            soup.find('span', {'class': 'more'}).decompose()
+            soup.find('span', {'class': 'less hidden'}).decompose()
+            content = soup.text.strip()

        transformed = Post(
            raw_id=data.id,
@@ -72,12 +93,19 @@ class BitchuteTransformer(Transformer):
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
-            url=raw['url'],
+            url=raw['url'] if raw['url'] else None,
            content=content,
            author_id=raw['author_id'],
-            author_username=raw['author'])
+            author_username=raw['author'],
+            reply_to=reply_to,
+            hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
+            likes = raw['likes'],
+            views = int(raw['views']) if raw.get('views') else None,
+            video_title = raw['subject'],
+            video_duration = parse_duration_str(raw['length']))

        transformed = insert(transformed)
+        session.flush()

 def parse_created(created: str, date_archived: datetime) -> datetime:
    """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime 
@@ -93,4 +121,11 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
        _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
        kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} 

-        return date_archived - relativedelta(**kwargs)
+        return date_archived - relativedelta(**kwargs)
+
+def parse_duration_str(duration_str: str) -> int:
+    if not duration_str:
+        return None
+    else:
+        duration_list = duration_str.split(':')
+        return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
--- a/cisticola/transformer/gettr.py
+++ b/cisticola/transformer/gettr.py
@@ -3,6 +3,8 @@ from loguru import logger
 from typing import Generator, Union, Callable
 import dateutil.parser
 from datetime import datetime, timezone
+from gogettr import PublicClient
+from gogettr.api import GettrApiError

 from cisticola.transformer.base import Transformer 
 from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -49,7 +51,34 @@ class GettrTransformer(Transformer):
        raw = json.loads(data.raw_data)

        if raw["activity"]["action"] == "shares_pst":
-            forwarded_from = raw["activity"]["uid"]
+            fwd_from = str(raw["activity"]["uid"])
+            channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first()
+            if channel is None:
+                try:
+                    client = PublicClient()
+                    profile = client.user_info(fwd_from.lower())
+                    screenname = profile.get('_id')
+                    channel = Channel(
+                        name=profile.get('nickname'),
+                        platform_id=screenname,
+                        platform=data.platform,
+                        url="https://gettr.com/user/" + screenname,
+                        screenname=screenname,
+                        category='forwarded',
+                        source=self.__version__,
+                        )
+                except GettrApiError:
+                     channel = Channel(
+                        name=None,
+                        platform_id=fwd_from,
+                        platform=data.platform,
+                        url="https://gettr.com/user/" + fwd_from,
+                        screenname=fwd_from,
+                        category='forwarded',
+                        source=self.__version__,
+                        )
+                channel = insert(channel)
+            forwarded_from = channel.id
        else:
            forwarded_from = None

@@ -69,7 +98,11 @@ class GettrTransformer(Transformer):
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            outlinks = list(filter(None, [raw.get("prevsrc")])),
-            forwarded_from = forwarded_from)
+            forwarded_from = forwarded_from,
+            likes = raw.get('lkbpst'),
+            forwards = raw.get("shbpst"),
+            views = raw.get('vfpst')
+            )

        insert(transformed)

--- a/cisticola/transformer/rumble.py
+++ b/cisticola/transformer/rumble.py
@@ -61,10 +61,19 @@ class RumbleTransformer(Transformer):
            url=raw['link'],
            content=raw['content'],
            author_id=raw['author_id'],
-            author_username=raw['author_name'])
+            author_username=raw['author_name'],
+            views = _process_number(raw.get('views')),
+            likes = _process_number(raw.get('rumbles')))

        insert(transformed)

        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
-        #     insert(m)
+        #     insert(m)
+
+def _process_number(s):
+
+    if s is None:
+        return None
+    else:
+        return int(s.replace(',', ''))
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -220,7 +220,9 @@ class TelegramTelethonTransformer(Transformer):
            author_username=author_username,
            forwarded_from=fwd_from,
            reply_to=reply_to,
-            mentions = mentions
+            mentions = mentions,
+            forwards = raw.get('forwards'),
+            views = raw.get('views')
        )

        transformed = insert(transformed)