From 7d72c0de050a4e86401f63e6dd990c2d48f5906e Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 29 Jul 2022 12:16:17 +0200 Subject: [PATCH 1/5] Add index for network analysis --- cisticola/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cisticola/base.py b/cisticola/base.py index d1f80b8..01df6f4 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -6,7 +6,7 @@ import json import io from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index import pytesseract import PIL import exiftool @@ -515,6 +515,8 @@ post_table = Table('posts', mapper_registry.metadata, Column('normalized_content', String) ) +posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from) + media_table = Table('media', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), From f3997ff6ae1f28a1aca8ed59c665457908e7ed62 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 1 Aug 2022 09:58:52 +0000 Subject: [PATCH 2/5] Catch errors in Bitchute channel profile scraper; add multi index on posts forwarded from/channel --- cisticola/base.py | 4 +++- cisticola/scraper/bitchute.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index d1f80b8..01df6f4 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -6,7 +6,7 @@ import json import io from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index import pytesseract import PIL import exiftool @@ -515,6 +515,8 @@ post_table = Table('posts', mapper_registry.metadata, Column('normalized_content', String) ) +posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from) + media_table = Table('media', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index b640e1d..284b300 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -71,7 +71,7 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True - @logger.catch(reraise = True) + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: base_url = channel.url From 4a17c3475de39c7613e3723d07022e4a5ba266d7 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 24 Aug 2022 15:32:19 +0200 Subject: [PATCH 3/5] Add explicit source column to gsheet --- sync_with_gsheet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index 72eee59..04d9998 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -55,7 +55,7 @@ def sync_channels(args, session): channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first() if not channel: - channel = Channel(**c, source="researcher") + channel = Channel(**c) logger.debug(f"{channel} does not exist, adding") session.add(channel) session.flush() @@ -77,7 +77,7 @@ def sync_channels(args, session): channel.public = c["public"] channel.chat = c["chat"] channel.notes = c["notes"] - channel.source = "researcher" + channel.source = c["source"] session.flush() session.commit() @@ -118,7 +118,7 @@ def sync_channels(args, session): channel.public = c["public"] channel.chat = c["chat"] channel.notes = c["notes"] - channel.source = "researcher" + channel.source = c["source"] if channel_info and channel.screenname != channel_info.screenname: channel.screenname = channel_info.screenname From a01d139bef0522ba318bb085962a72a355c31319 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 24 Aug 2022 15:35:08 +0200 Subject: [PATCH 4/5] Remove normalized_url column from channel creation --- sync_with_gsheet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index 04d9998..1c15fbe 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -34,6 +34,7 @@ def sync_channels(args, session): # add new channel if c["id"] == "" or c["id"] is None: del c["id"] + del c["normalized_url"] # check to see if this already exists, platform_id = None From 86656f8ba3c12fd8f92abaeb2ee374b659991951 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 26 Aug 2022 15:56:46 +0200 Subject: [PATCH 5/5] Scrape snowball_it channels too --- cisticola/scraper/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index e8d2d7d..96090e0 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -342,7 +342,7 @@ class ScraperController: session = self.session() - channels = session.query(Channel).where(Channel.source=='researcher').all() + channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).all() session.close() @@ -359,7 +359,7 @@ class ScraperController: # This will sort the channels by the least recently scraped. most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery() channels = session.query(Channel).\ - where(Channel.source=='researcher').\ + filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).\ outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\ order_by(nullsfirst(most_recently_archived.c.date.asc())).all()