mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
refactored import structure
This commit is contained in:
@@ -1,71 +1,3 @@
|
||||
from typing import List
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM.

    Call ``connect_to_db`` before ``scrape_channels``; without a configured
    session factory ``scrape_channels`` logs an error and does nothing.
    """

    def __init__(self):
        # Registered scrapers, tried in registration order.
        self.scrapers = []
        # Session factory; stays None until connect_to_db() is called.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: cisticola.scraper.base.Scraper):
        """Add a single scraper to the list consulted by ``scrape_channels``."""
        self.scrapers.append(scraper)

    def scrape_channels(self, channels: List[cisticola.base.Channel]):
        """Scrape every channel with the first registered scraper that accepts it.

        For each channel the most recent stored ScraperResult (if any) is
        passed to the scraper as ``since``; every post the scraper yields is
        added to the database and committed once per channel. Channels that no
        scraper can handle are reported with a warning.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            # First scraper that claims the channel wins.
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                logger.warning(f"No handler found for Channel {channel}")
                continue

            # One session per channel (the previous code opened two and never
            # closed either of them).
            session = self.session()
            try:
                # get most recent post
                rows = session.query(cisticola.base.ScraperResult).where(
                    cisticola.base.ScraperResult.channel == channel.id).order_by(
                    cisticola.base.ScraperResult.date.desc()).limit(1).all()
                since = rows[0] if rows else None

                added = 0
                for post in scraper.get_posts(channel, since=since):
                    session.add(post)
                    added += 1

                session.commit()
            finally:
                # Always release the session, even if the scraper raises.
                session.close()

            logger.info(
                f"{scraper} found {added} new posts from {channel}")

    def connect_to_db(self, engine):
        """Create the mapped tables on ``engine`` and bind a session factory to it."""
        # create tables
        cisticola.base.mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis.

    Currently a placeholder: it holds no state and defines no behaviour yet.
    """

    def __init__(self):
        # Nothing to initialise yet.
        pass
|
||||
from . import base
|
||||
from . import scraper
|
||||
from . import transformer
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
"""A minimally processed result from a scraper"""
|
||||
@@ -84,4 +85,4 @@ analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('author_username', String)
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
131
cisticola/examples/russian_telegram_ingest.py
Normal file
131
cisticola/examples/russian_telegram_ingest.py
Normal file
@@ -0,0 +1,131 @@
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
TelegramSnscrapeScraper)
|
||||
|
||||
def _qanon_telegram_channel(channel_id, name, platform_id, followers, screenname):
    """Build a public, non-chat Russian Telegram Channel in the "Qanon" category.

    All channels in this example share the same category, platform, country
    and flags; only the identifying fields differ. The URL is the ``t.me``
    address for ``screenname``.
    """
    return Channel(
        id=channel_id,
        name=name,
        platform_id=platform_id,
        category="Qanon",
        followers=followers,
        platform="Telegram",
        url=f"https://t.me/{screenname}",
        screenname=screenname,
        country="RU",
        influencer=None,
        public=True,
        chat=False,
        notes="",
    )


# (id, name, platform_id, followers, screenname)
test_channels = [
    _qanon_telegram_channel(
        0, "QAnon Россия", -1001319637748, 94048, "qanonrus"),
    _qanon_telegram_channel(
        1, "The Great Awakening | Q", -1001325597521, 5715, "greatawakin"),
    _qanon_telegram_channel(
        2, "Великое Пробуждение", -1001285898079, 5861, "greatawakeningrus"),
    _qanon_telegram_channel(
        3, "T🕊Редакция Президент Гордон🕊", -1001101170442, 5743,
        "prezidentgordonteam"),
    _qanon_telegram_channel(
        4, "ПРОЕКТ АВРОРА", -1001279171101, 5930, "project_aurora"),
    _qanon_telegram_channel(
        5, "Сон Разума", -1001202338312, 27099, "error_288"),
    _qanon_telegram_channel(
        6, "Пробуждающий Мир - официальный канал", -1001492521207, 19097,
        "promirru"),
    _qanon_telegram_channel(
        7, "ЦЕЛЬНОЗОР", -1001642737506, 13654, "tselnozor"),
]

# Register the Telegram scraper, attach a local SQLite database and scrape.
controller = ScraperController()

telegram = TelegramSnscrapeScraper()
controller.register_scraper(telegram)

engine = create_engine('sqlite:///russian_telegram.db')
controller.connect_to_db(engine)

controller.scrape_channels(test_channels)
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
from .base import Scraper, ScraperController
|
||||
from .bitchute import BitchuteScraper
|
||||
from .gab import GabScraper
|
||||
from .gettr import GettrScraper
|
||||
from .odysee import OdyseeScraper
|
||||
from .rumble import RumbleScraper
|
||||
from .telegram_snscrape import TelegramSnscrapeScraper
|
||||
from .twitter import TwitterScraper
|
||||
@@ -1,13 +1,17 @@
|
||||
from typing import Generator, Tuple
|
||||
import cisticola.base
|
||||
import requests
|
||||
from typing import Generator, Tuple, List
|
||||
import os
|
||||
import boto3
|
||||
from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
import boto3
|
||||
from loguru import logger
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
|
||||
class Scraper:
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
@@ -89,8 +93,77 @@ class Scraper:
|
||||
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: cisticola.base.Channel) -> bool:
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
pass
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
pass
|
||||
|
||||
|
||||
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM.

    Call ``connect_to_db`` before ``scrape_channels``; without a configured
    session factory ``scrape_channels`` logs an error and does nothing.
    """

    def __init__(self):
        # Registered scrapers, tried in registration order.
        self.scrapers = []
        # Session factory; stays None until connect_to_db() is called.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: Scraper):
        """Add a single scraper to the list consulted by ``scrape_channels``."""
        self.scrapers.append(scraper)

    def register_scrapers(self, scraper: List[Scraper]):
        """Add several scrapers at once, preserving their order."""
        self.scrapers.extend(scraper)

    def scrape_channels(self, channels: List[Channel]):
        """Scrape every channel with the first registered scraper that accepts it.

        For each channel the most recent stored ScraperResult (if any) is
        passed to the scraper as ``since``; every post the scraper yields is
        added to the database and committed once per channel. Channels that no
        scraper can handle are reported with a warning.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            # First scraper that claims the channel wins.
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                logger.warning(f"No handler found for Channel {channel}")
                continue

            # One session per channel (the previous code opened two and never
            # closed either of them).
            session = self.session()
            try:
                # get most recent post
                rows = session.query(ScraperResult).where(
                    ScraperResult.channel == channel.id).order_by(
                    ScraperResult.date.desc()).limit(1).all()
                since = rows[0] if rows else None

                added = 0
                for post in scraper.get_posts(channel, since=since):
                    session.add(post)
                    added += 1

                session.commit()
            finally:
                # Always release the session, even if the scraper raises.
                session.close()

            logger.info(
                f"{scraper} found {added} new posts from {channel}")

    def connect_to_db(self, engine):
        """Create the mapped tables on ``engine`` and bind a session factory to it."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis.

    Currently a placeholder: it holds no state and defines no behaviour yet.
    """

    def __init__(self):
        # Nothing to initialise yet.
        pass
|
||||
|
||||
@@ -9,9 +9,9 @@ from typing import Generator
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import cisticola.base
|
||||
|
||||
class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
class BitchuteScraper(Scraper):
|
||||
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
||||
library"""
|
||||
__version__ = "BitchuteScraper 0.0.1"
|
||||
@@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update(self.headers)
|
||||
@@ -32,8 +32,6 @@ class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")
|
||||
time.sleep(0.25)
|
||||
|
||||
# Don't scrape comment information
|
||||
#TODO implement framework for processing and storing comments
|
||||
detail = 'comments'
|
||||
|
||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
||||
@@ -52,7 +50,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Bitchute",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator
|
||||
|
||||
from garc import Garc
|
||||
|
||||
class GabScraper(cisticola.scraper.base.Scraper):
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
class GabScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
__version__ = "GabScraper 0.0.1"
|
||||
|
||||
@@ -14,7 +15,7 @@ class GabScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
client = Garc(profile = 'main')
|
||||
username = GabScraper.get_username_from_url(channel.url)
|
||||
|
||||
@@ -37,7 +38,7 @@ class GabScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gab",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
from gogettr import PublicClient
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
from gogettr import PublicClient
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
class GettrScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
@@ -17,7 +18,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
@@ -47,7 +48,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
import requests
|
||||
|
||||
class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class OdyseeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
__version__ = "OdyseeScraper 0.0.1"
|
||||
|
||||
@@ -17,7 +19,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username)
|
||||
@@ -43,7 +45,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
all_comments = video.get_all_comments()
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Odysee",
|
||||
channel=channel.id,
|
||||
@@ -55,7 +57,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
for comment in all_comments:
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Odysee",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,20 +1,19 @@
|
||||
from concurrent.futures import process
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import youtube_dl
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
BASE_URL = 'https://rumble.com'
|
||||
|
||||
class RumbleScraper(cisticola.scraper.base.Scraper):
|
||||
class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
|
||||
@@ -23,7 +22,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = RumbleScraper.get_username_from_url(channel.url)
|
||||
scraper = get_channel_videos(username)
|
||||
@@ -40,7 +39,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Rumble",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from typing import Generator
|
||||
import snscrape.modules
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import snscrape.modules
|
||||
|
||||
class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
scr = snscrape.modules.telegram.TelegramChannelScraper(
|
||||
channel.screenname)
|
||||
|
||||
@@ -34,7 +35,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post.video] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Telegram",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
from snscrape.modules.twitter import TwitterProfileScraper, Video, Gif, Photo
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TwitterScraper(Scraper):
|
||||
"""An implementation of a Scraper for Twitter, using snscrape library"""
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id)
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
scraper = TwitterProfileScraper(channel.platform_id)
|
||||
|
||||
first = True
|
||||
|
||||
@@ -28,13 +30,13 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
if tweet.media:
|
||||
for media in tweet.media:
|
||||
if type(media) == snscrape.modules.twitter.Video:
|
||||
if type(media) == Video:
|
||||
variant = max(
|
||||
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
|
||||
url = variant.url
|
||||
elif type(media) == snscrape.modules.twitter.Gif:
|
||||
elif type(media) == Gif:
|
||||
url = media.variants[0].url
|
||||
elif type(media) == snscrape.modules.twitter.Photo:
|
||||
elif type(media) == Photo:
|
||||
url = media.fullUrl
|
||||
else:
|
||||
logger.warning(f"Could not get media URL of {media}")
|
||||
@@ -45,7 +47,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
channel=channel.id,
|
||||
|
||||
@@ -1,16 +1,2 @@
|
||||
import cisticola.base
|
||||
|
||||
class Transformer:
    """Interface class for transformers"""

    __version__ = "Transformer 0.0.0"

    def __init__(self):
        pass

    # Both hooks take ``self`` so they can be called on an instance and
    # overridden by subclasses as ordinary methods.
    def can_handle(self, data: cisticola.base.ScraperResult) -> bool:
        """Report whether this transformer can process ``data``.

        The base implementation does nothing and returns None.
        """
        pass

    def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
        """Convert ``data`` into a TransformedResult.

        The base implementation does nothing and returns None.
        """
        pass
|
||||
|
||||
from . import base
|
||||
from .twitter import TwitterTransformer
|
||||
16
cisticola/transformer/base.py
Normal file
16
cisticola/transformer/base.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
|
||||
class Transformer:
    """Interface class for transformers"""

    __version__ = "Transformer 0.0.0"

    def __init__(self):
        pass

    # Both hooks take ``self`` so they can be called on an instance and
    # overridden by subclasses as ordinary methods.
    def can_handle(self, data: ScraperResult) -> bool:
        """Report whether this transformer can process ``data``.

        The base implementation does nothing and returns None.
        """
        pass

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Convert ``data`` into a TransformedResult.

        The base implementation does nothing and returns None.
        """
        pass
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
import cisticola.transformer
|
||||
import cisticola.base
|
||||
import json
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
|
||||
class TwitterTransformer(cisticola.transformer.Transformer):
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
|
||||
__version__ = "TwitterTransformer 0.0.1"
|
||||
|
||||
def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = cisticola.base.TransformedResult(
|
||||
transformed = TransformedResult(
|
||||
raw_id=data.id,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
|
||||
181
test.py
181
test.py
@@ -1,76 +1,127 @@
|
||||
import cisticola
|
||||
import cisticola.scraper.telegram_snscrape
|
||||
import cisticola.scraper.twitter
|
||||
import cisticola.scraper.gettr
|
||||
import cisticola.scraper.bitchute
|
||||
import cisticola.scraper.odysee
|
||||
import cisticola.scraper.gab
|
||||
import cisticola.scraper.rumble
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
GabScraper,
|
||||
GettrScraper,
|
||||
OdyseeScraper,
|
||||
RumbleScraper,
|
||||
TelegramSnscrapeScraper,
|
||||
TwitterScraper)
|
||||
|
||||
test_channels = [
|
||||
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
|
||||
category="test", followers=None, platform="Twitter",
|
||||
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
|
||||
influencer=None, public=True, chat=False,
|
||||
notes=""),
|
||||
cisticola.base.Channel(id=1, name="South West Ohio Proud Boys (test)", platform_id=-1001276612436,
|
||||
category="test", followers=None, platform="Telegram",
|
||||
url="https://t.me/SouthwestOhioPB", screenname="SouthwestOhioPB", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(id=2, name="LizardRepublic (test)", platform_id='lizardrepublic',
|
||||
category="test", followers=None, platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=4, name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
|
||||
category="test", followers=None, platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon',
|
||||
category="test", followers=None, platform="Odysee",
|
||||
url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
|
||||
category="test", followers=None, platform="Gab",
|
||||
url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
|
||||
category="test", followers=None, platform="Rumble",
|
||||
url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA",
|
||||
influencer=None, public=True, chat=False, notes="")]
|
||||
Channel(
|
||||
id=0,
|
||||
name="Logan Williams (test)",
|
||||
platform_id=891729132,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Twitter",
|
||||
url="https://twitter.com/obtusatum",
|
||||
screenname="obtusatum",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="South West Ohio Proud Boys (test)",
|
||||
platform_id=-1001276612436,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Telegram",
|
||||
url="https://t.me/SouthwestOhioPB",
|
||||
screenname="SouthwestOhioPB",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="LizardRepublic (test)",
|
||||
platform_id='lizardrepublic',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic",
|
||||
screenname="lizardrepublic",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=4,
|
||||
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=5,
|
||||
name="Mak1n' Bacon (test)",
|
||||
platform_id='Mak1nBacon',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Odysee",
|
||||
url="https://odysee.com/@Mak1nBacon",
|
||||
screenname='Mak1nBacon',
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=6,
|
||||
name="Capt. Marc Simon (test)",
|
||||
platform_id='marc_capt',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gab",
|
||||
url="https://gab.com/marc_capt",
|
||||
screenname='marc_capt',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=7,
|
||||
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Rumble",
|
||||
url="https://rumble.com/c/c-916305",
|
||||
screenname='we are uploading',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes="")]
|
||||
|
||||
controller = ScraperController()
|
||||
|
||||
controller = cisticola.ScraperController()
|
||||
scrapers = [
|
||||
BitchuteScraper(),
|
||||
GabScraper(),
|
||||
GettrScraper(),
|
||||
OdyseeScraper(),
|
||||
RumbleScraper(),
|
||||
TelegramSnscrapeScraper(),
|
||||
TwitterScraper()]
|
||||
|
||||
twitter = cisticola.scraper.twitter.TwitterScraper()
|
||||
controller.register_scraper(twitter)
|
||||
|
||||
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
|
||||
controller.register_scraper(telegram)
|
||||
|
||||
gettr = cisticola.scraper.gettr.GettrScraper()
|
||||
controller.register_scraper(gettr)
|
||||
|
||||
bitchute = cisticola.scraper.bitchute.BitchuteScraper()
|
||||
controller.register_scraper(bitchute)
|
||||
|
||||
odysee = cisticola.scraper.odysee.OdyseeScraper()
|
||||
controller.register_scraper(odysee)
|
||||
|
||||
gab = cisticola.scraper.gab.GabScraper()
|
||||
controller.register_scraper(gab)
|
||||
|
||||
rumble = cisticola.scraper.rumble.RumbleScraper()
|
||||
controller.register_scraper(rumble)
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
Reference in New Issue
Block a user