mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 21:08:34 +03:00
Merge branch 'main' of https://github.com/bellingcat/cisticola into main
This commit is contained in:
@@ -6,7 +6,7 @@ import json
|
||||
import io
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index
|
||||
import pytesseract
|
||||
import PIL
|
||||
import exiftool
|
||||
@@ -515,6 +515,8 @@ post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('normalized_content', String)
|
||||
)
|
||||
|
||||
posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from)
|
||||
|
||||
media_table = Table('media', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
|
||||
@@ -342,7 +342,7 @@ class ScraperController:
|
||||
|
||||
session = self.session()
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).all()
|
||||
|
||||
session.close()
|
||||
|
||||
@@ -359,7 +359,7 @@ class ScraperController:
|
||||
# Sort channels so the least recently archived come first; channels that have never been archived (NULL date) are placed at the very front.
|
||||
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
|
||||
channels = session.query(Channel).\
|
||||
where(Channel.source=='researcher').\
|
||||
filter((Channel.source=='researcher')|(Channel.source=='snowball_it')).\
|
||||
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
|
||||
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ class BitchuteScraper(Scraper):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
@logger.catch
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
base_url = channel.url
|
||||
|
||||
@@ -34,6 +34,7 @@ def sync_channels(args, session):
|
||||
# add new channel
|
||||
if c["id"] == "" or c["id"] is None:
|
||||
del c["id"]
|
||||
del c["normalized_url"]
|
||||
|
||||
# check to see if this already exists,
|
||||
platform_id = None
|
||||
@@ -55,7 +56,7 @@ def sync_channels(args, session):
|
||||
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source="researcher")
|
||||
channel = Channel(**c)
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
@@ -77,7 +78,7 @@ def sync_channels(args, session):
|
||||
channel.public = c["public"]
|
||||
channel.chat = c["chat"]
|
||||
channel.notes = c["notes"]
|
||||
channel.source = "researcher"
|
||||
channel.source = c["source"]
|
||||
|
||||
session.flush()
|
||||
session.commit()
|
||||
@@ -118,7 +119,7 @@ def sync_channels(args, session):
|
||||
channel.public = c["public"]
|
||||
channel.chat = c["chat"]
|
||||
channel.notes = c["notes"]
|
||||
channel.source = "researcher"
|
||||
channel.source = c["source"]
|
||||
|
||||
if channel_info and channel.screenname != channel_info.screenname:
|
||||
channel.screenname = channel_info.screenname
|
||||
|
||||
Reference in New Issue
Block a user