This commit is contained in:
Logan Williams
2022-08-29 09:11:21 +00:00
4 changed files with 10 additions and 7 deletions

View File

@@ -6,7 +6,7 @@ import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index
import pytesseract
import PIL
import exiftool
@@ -515,6 +515,8 @@ post_table = Table('posts', mapper_registry.metadata,
Column('normalized_content', String)
)
posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from)
media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),

View File

@@ -342,7 +342,7 @@ class ScraperController:
session = self.session()
channels = session.query(Channel).where(Channel.source=='researcher').all()
channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).all()
session.close()
@@ -359,7 +359,7 @@ class ScraperController:
# Sort channels so the least recently scraped come first; channels with no archived info at all (NULL date) sort ahead of everything else.
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
channels = session.query(Channel).\
where(Channel.source=='researcher').\
filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).\
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()

View File

@@ -71,7 +71,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
@logger.catch(reraise = True)
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url

View File

@@ -34,6 +34,7 @@ def sync_channels(args, session):
# add new channel
if c["id"] == "" or c["id"] is None:
del c["id"]
del c["normalized_url"]
# check whether this channel already exists
platform_id = None
@@ -55,7 +56,7 @@ def sync_channels(args, session):
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
if not channel:
channel = Channel(**c, source="researcher")
channel = Channel(**c)
logger.debug(f"{channel} does not exist, adding")
session.add(channel)
session.flush()
@@ -77,7 +78,7 @@ def sync_channels(args, session):
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
channel.source = "researcher"
channel.source = c["source"]
session.flush()
session.commit()
@@ -118,7 +119,7 @@ def sync_channels(args, session):
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
channel.source = "researcher"
channel.source = c["source"]
if channel_info and channel.screenname != channel_info.screenname:
channel.screenname = channel_info.screenname