mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Use new RawChannelInfo class
This commit is contained in:
4
Pipfile
4
Pipfile
@@ -23,6 +23,8 @@ gspread = "*"
|
||||
cryptg = "*"
|
||||
gabber = {git = "https://github.com/stanfordio/gabber.git"}
|
||||
psycopg2-binary = "*"
|
||||
tqdm = "*"
|
||||
ratelimit = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
@@ -34,7 +36,7 @@ sphinx = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.8"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
1532
Pipfile.lock
generated
Normal file
1532
Pipfile.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
33
app.py
33
app.py
@@ -3,6 +3,8 @@ from loguru import logger
|
||||
import gspread
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import os
|
||||
import time
|
||||
|
||||
from cisticola.base import Channel, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
@@ -19,7 +21,7 @@ from cisticola.scraper import (
|
||||
def sync_channels(args):
|
||||
logger.info("Synchronizing channels")
|
||||
|
||||
session = get_db_session(args)
|
||||
session = get_db_session()
|
||||
|
||||
gc = gspread.service_account(filename='service_account.json')
|
||||
|
||||
@@ -29,6 +31,7 @@ def sync_channels(args):
|
||||
row = 2
|
||||
|
||||
for c in channels:
|
||||
logger.info(c)
|
||||
del c['id']
|
||||
del c['followers']
|
||||
|
||||
@@ -43,20 +46,29 @@ def sync_channels(args):
|
||||
|
||||
|
||||
# check to see if this already exists,
|
||||
channel = session.query(Channel).filter_by(platform_id=None if c['platform_id'] == '' else c['platform_id'], platform=c['platform'], url=c['url']).first()
|
||||
|
||||
platform_id = None
|
||||
if c['platform_id'] != '':
|
||||
platform_id = c['platform_id']
|
||||
|
||||
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
|
||||
logger.info(channel)
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
time.sleep(1)
|
||||
|
||||
row += 1
|
||||
|
||||
session.commit()
|
||||
|
||||
def get_db_session(args):
|
||||
engine = create_engine(args.db)
|
||||
def get_db_session():
|
||||
engine = create_engine(os.environ['DB'])
|
||||
|
||||
session_generator = sessionmaker()
|
||||
session_generator.configure(bind=engine)
|
||||
@@ -64,8 +76,8 @@ def get_db_session(args):
|
||||
|
||||
return session
|
||||
|
||||
def get_scraper_controller(args):
|
||||
engine = create_engine(args.db)
|
||||
def get_scraper_controller():
|
||||
engine = create_engine(os.environ['DB'])
|
||||
|
||||
controller = ScraperController()
|
||||
controller.connect_to_db(engine)
|
||||
@@ -90,8 +102,8 @@ def archive_media(args):
|
||||
controller = get_scraper_controller(args)
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
def init_db():
    """Create all tables registered on ``mapper_registry`` in the database.

    The SQLAlchemy connection string is read from the ``DB`` environment
    variable rather than from command line arguments.

    :raises KeyError: if the ``DB`` environment variable is not set.
    """
    engine = create_engine(os.environ['DB'])

    # Emits CREATE TABLE for every table attached to the registry's metadata.
    mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -99,14 +111,13 @@ if __name__ == '__main__':
|
||||
|
||||
parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
|
||||
parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"')
|
||||
parser.add_argument('--db', type=str, help='[*] Sqlalchemy database string, eg, "sqlite:///cisticola.db"')
|
||||
parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
|
||||
parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command == 'init-db':
|
||||
init_db(args)
|
||||
init_db()
|
||||
elif args.command == 'sync-channels':
|
||||
sync_channels(args)
|
||||
elif args.command == 'scrape-channels':
|
||||
|
||||
@@ -34,7 +34,7 @@ class ScraperResult:
|
||||
date: datetime
|
||||
|
||||
#: JSON dump of dict that contains all data scraped for the post.
|
||||
raw_data: str
|
||||
raw_posts: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
@@ -44,7 +44,7 @@ class ScraperResult:
|
||||
|
||||
#: Has the media in this post been archived?
|
||||
media_archived: bool
|
||||
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
"""Information about a specific channel to be scraped.
|
||||
@@ -89,11 +89,31 @@ class Channel:
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
@dataclass
class RawChannelInfo:
    """A minimally processed channel profile, as returned by a scraper's
    ``get_profile`` method.
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of the channel ID (``channels.id``) that this profile was scraped from.
    channel: int

    #: JSON dump of dict that contains all profile data scraped for the channel.
    raw_data: str

    #: Datetime (relative to UTC) that the channel profile was archived at.
    date_archived: datetime
|
||||
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
#: ID number of the scraped post in the ``raw_data`` table
|
||||
#: ID number of the scraped post in the ``raw_posts`` table
|
||||
raw_id: int
|
||||
|
||||
#: Platform specific post ID
|
||||
@@ -144,7 +164,7 @@ class Media:
|
||||
"""Base class for organizing information about a media file.
|
||||
"""
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
|
||||
raw_id: int
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
|
||||
@@ -221,7 +241,7 @@ class Video(Media):
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('raw_posts', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON),
|
||||
Column('media_archived', Boolean))
|
||||
|
||||
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime))
|
||||
|
||||
channel_table = Table('channels', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('name', String),
|
||||
Column('platform_id', Integer),
|
||||
Column('platform_id', String),
|
||||
Column('category', String),
|
||||
Column('platform', String),
|
||||
Column('url', String),
|
||||
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
|
||||
Column('platform_id', Integer),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
|
||||
mapper_registry.map_imperatively(Post, post_table)
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
|
||||
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -9,7 +9,7 @@ from typing import Generator
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class BitchuteScraper(Scraper):
|
||||
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromtimestamp(post['timestamp']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
base_url = channel.url
|
||||
|
||||
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
|
||||
'subscribers': counts['subscriber_count'],
|
||||
'views': int(counts['about_view_count'].split(' ')[0])}
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def strip_tags(html, convert_newlines=True):
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
|
||||
from gabber.client import Client, GAB_API_BASE_URL
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GabScraper(Scraper):
|
||||
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
client = Client(
|
||||
username = os.environ['GAB_USER'],
|
||||
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
|
||||
|
||||
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from gogettr import PublicClient
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GettrScraper(Scraper):
|
||||
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Gettr profile for ``channel`` and wrap it in a ``RawChannelInfo``.

    :param channel: channel whose ``url`` is used to derive the username and
        whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        value returned by ``PublicClient.user_info``.
    """
    client = PublicClient()
    username = self.get_username_from_url(channel.url)
    profile = client.user_info(username)

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
from loguru import logger
|
||||
import instaloader
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
BASE_URL = 'https://www.instagram.com/'
|
||||
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=post.date_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post._asdict(), default=str),
|
||||
raw_posts=json.dumps(post._asdict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=comment.created_at_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(comment_dict, default=str),
|
||||
raw_posts=json.dumps(comment_dict, default=str),
|
||||
archived_urls={},
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
|
||||
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
|
||||
profile['followers'] = user_profile.followers
|
||||
profile['followees'] = user_profile.followees
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -8,7 +8,7 @@ from loguru import logger
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from polyphemus.api import get_auth_token
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class OdyseeScraper(Scraper):
|
||||
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=video.info['claim_id'],
|
||||
date=datetime.fromtimestamp(video.info['created']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video.info),
|
||||
raw_posts=json.dumps(video.info),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=comment.info['claim_id'],
|
||||
date=datetime.fromtimestamp(comment.info['created']),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(comment.info),
|
||||
raw_posts=json.dumps(comment.info),
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
|
||||
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
return f'{key}.{ext}'
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Odysee profile for ``channel`` and wrap it in a ``RawChannelInfo``.

    :param channel: channel whose ``url`` is used to derive the username and
        whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        ``OdyseeChannel.info`` attribute.
    """
    username = self.get_username_from_url(channel.url)
    odysee_channel = OdyseeChannel(channel_name=username, auth_token=self.auth_token)
    profile = odysee_channel.info

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
|
||||
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper, make_request
|
||||
|
||||
BASE_URL = 'https://rumble.com'
|
||||
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
|
||||
platform_id=post['media_url'].split('/')[-2],
|
||||
date=post['datetime'].replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post, default = str),
|
||||
raw_posts=json.dumps(post, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
|
||||
if channel.platform == "Rumble" and channel.url is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Scrape the Rumble profile for ``channel`` and wrap it in a ``RawChannelInfo``.

    :param channel: channel whose ``url`` is passed to ``get_channel_profile``
        and whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        profile dict built by ``get_channel_profile``.
    """
    profile = get_channel_profile(url=channel.url)

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -128,6 +132,7 @@ def get_channel_profile(url):
|
||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
||||
'cover': cover_soup.get('src') if cover_soup else None,
|
||||
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
|
||||
|
||||
return profile
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -1,10 +1,10 @@
|
||||
from typing import Generator
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import json
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
platform_id=post.url,
|
||||
date=post.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
raw_posts=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media
|
||||
)
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Telegram channel entity via snscrape and wrap it in a
    ``RawChannelInfo``.

    :param channel: channel whose ``screenname`` identifies the Telegram
        channel and whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        entity's ``__dict__``.
    """
    scr = snscrape.modules.telegram.TelegramChannelScraper(
        channel.screenname)

    # NOTE(review): relies on snscrape's private ``_get_entity`` helper and
    # assumes every attribute of the entity is JSON serializable - confirm.
    profile = scr._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
|
||||
from telethon.tl.functions.channels import GetFullChannelRequest
|
||||
from telethon.tl import types
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
key = list(result.archived_urls.keys())[0]
|
||||
|
||||
if result.archived_urls[key] is None:
|
||||
raw = json.loads(result.raw_data)
|
||||
raw = json.loads(result.raw_posts)
|
||||
|
||||
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
|
||||
|
||||
@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
|
||||
platform_id=post_url,
|
||||
date=post.date.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
raw_posts=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
|
||||
full_channel = client(GetFullChannelRequest(channel = username))
|
||||
profile = full_channel.__dict__
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
|
||||
from loguru import logger
|
||||
import json
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
|
||||
|
||||
class TwitterScraper(Scraper):
|
||||
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=tweet.json(),
|
||||
raw_posts=tweet.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
|
||||
key = parsed_url.path.split('/')[-1] + ext
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Twitter user entity for ``channel`` and wrap it in a
    ``RawChannelInfo``.

    :param channel: channel whose ``screenname`` identifies the Twitter user
        and whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        entity's ``__dict__``.
    :raises ChannelDoesNotExistError: if snscrape returns no entity for the
        screenname.
    """
    scraper = TwitterUserScraper(channel.screenname)
    entity = scraper._get_entity()

    if entity is None:
        raise ChannelDoesNotExistError(channel.url)

    # NOTE(review): the entity's attributes may include values json cannot
    # encode natively (presumably datetimes / nested objects - confirm);
    # ``default=str`` mirrors how other scrapers in this project dump raw data.
    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(entity.__dict__, default=str),
                          date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class VkontakteScraper(Scraper):
|
||||
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
|
||||
platform_id=post.url.split('/')[-1],
|
||||
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
raw_posts=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the VKontakte user entity for ``channel`` and wrap it in a
    ``RawChannelInfo``.

    :param channel: channel whose ``url`` is used to derive the username and
        whose ``platform`` and ``id`` are copied onto the result.
    :returns: ``RawChannelInfo`` whose ``raw_data`` is the JSON dump of the
        entity's ``__dict__``.
    """
    # This module's import header does not import ``json``; import it locally
    # so the dump below does not raise NameError.
    import json

    username = self.get_username_from_url(channel.url)
    scraper = VKontakteUserScraper(username)

    # NOTE(review): relies on snscrape's private ``_get_entity`` helper and
    # assumes every attribute of the entity is JSON serializable - confirm.
    profile = scraper._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -2,10 +2,9 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
|
||||
import yt_dlp
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper
|
||||
|
||||
class YoutubeScraper(Scraper):
|
||||
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
|
||||
platform_id=video_id,
|
||||
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video, default = str),
|
||||
raw_posts=json.dumps(video, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
ydl_opts = {}
|
||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||
|
||||
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
|
||||
meta = ydl.extract_info(
|
||||
channel.url,
|
||||
process=False)
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
|
||||
return meta
|
||||
@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
orig = raw['video_url']
|
||||
new = data.archived_urls[orig]
|
||||
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
|
||||
@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
|
||||
Reference in New Issue
Block a user