From 4a17c3475de39c7613e3723d07022e4a5ba266d7 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 24 Aug 2022 15:32:19 +0200 Subject: [PATCH 1/4] Add explicit source column to gsheet --- sync_with_gsheet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index 72eee59..04d9998 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -55,7 +55,7 @@ def sync_channels(args, session): channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first() if not channel: - channel = Channel(**c, source="researcher") + channel = Channel(**c) logger.debug(f"{channel} does not exist, adding") session.add(channel) session.flush() @@ -77,7 +77,7 @@ def sync_channels(args, session): channel.public = c["public"] channel.chat = c["chat"] channel.notes = c["notes"] - channel.source = "researcher" + channel.source = c["source"] session.flush() session.commit() @@ -118,7 +118,7 @@ def sync_channels(args, session): channel.public = c["public"] channel.chat = c["chat"] channel.notes = c["notes"] - channel.source = "researcher" + channel.source = c["source"] if channel_info and channel.screenname != channel_info.screenname: channel.screenname = channel_info.screenname From a01d139bef0522ba318bb085962a72a355c31319 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 24 Aug 2022 15:35:08 +0200 Subject: [PATCH 2/4] Remove normalized_url column from channel creation --- sync_with_gsheet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index 04d9998..1c15fbe 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -34,6 +34,7 @@ def sync_channels(args, session): # add new channel if c["id"] == "" or c["id"] is None: del c["id"] + del c["normalized_url"] # check to see if this already exists, platform_id = None From 86656f8ba3c12fd8f92abaeb2ee374b659991951 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 26 Aug 2022 15:56:46 +0200 Subject: [PATCH 3/4] Scrape snowball_it channels too --- cisticola/scraper/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index e8d2d7d..96090e0 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -342,7 +342,7 @@ class ScraperController: session = self.session() - channels = session.query(Channel).where(Channel.source=='researcher').all() + channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).all() session.close() @@ -359,7 +359,7 @@ class ScraperController: # This will sort the channels by the least recently scraped. most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery() channels = session.query(Channel).\ - where(Channel.source=='researcher').\ + filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).\ outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\ order_by(nullsfirst(most_recently_archived.c.date.asc())).all() From 1a29c0606236a97613667c36d23bd0a3d9a9bdd9 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 29 Aug 2022 09:11:06 +0000 Subject: [PATCH 4/4] Fix case where post is dummy (-1) --- cisticola/scraper/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index e8d2d7d..4343788 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -460,7 +460,7 @@ class ScraperController: for scraper in self.scrapers: # compare major versions - if scraper.__version__.split('.')[0] == post.scraper.split('.')[0]: + if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]: handled = True logger.debug(f"{scraper} is archiving media for ID {post.id}") post = scraper.archive_files(post)