mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-29 05:28:40 +03:00
Merge branch 'main' into channel-db
This commit is contained in:
@@ -1,33 +1,47 @@
|
||||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import tempfile
|
||||
import json
|
||||
import io
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
||||
import pytesseract
|
||||
import PIL
|
||||
import io
|
||||
import exiftool
|
||||
import json
|
||||
import os
|
||||
|
||||
from .utils import make_request
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
"""A minimally processed result from a scraper"""
|
||||
"""A minimally processed result from a scraper
|
||||
"""
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: Foreign key of channel ID that this was scraped from
|
||||
channel: int
|
||||
|
||||
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
|
||||
platform_id: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
|
||||
#: JSON dump of dict that contains all data scraped for the post.
|
||||
raw_data: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||
archived_urls: dict
|
||||
|
||||
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
@@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
"""Information about a specific channel to be scraped.
|
||||
"""
|
||||
|
||||
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
|
||||
name: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
platform_id: str
|
||||
|
||||
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
|
||||
category: str
|
||||
|
||||
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
|
||||
platform: str
|
||||
|
||||
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
|
||||
url: str
|
||||
|
||||
#: Screen name/username of channel.
|
||||
screenname: str
|
||||
|
||||
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
|
||||
country: str = None
|
||||
|
||||
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
|
||||
influencer: str = None
|
||||
|
||||
#: Whether or not the channel is publicly-accessible.
|
||||
public: bool = None
|
||||
|
||||
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
|
||||
chat: bool = None
|
||||
|
||||
#: Any other additional notes about the channel.
|
||||
notes: str = ""
|
||||
|
||||
#: Did the channel come from a researcher or a scraping process?
|
||||
source: str = None
|
||||
|
||||
def hydrate(self):
|
||||
@@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table)
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
#: ID number of the scraped post in the ``raw_data`` table
|
||||
raw_id: int
|
||||
|
||||
#: Platform specific post ID
|
||||
platform_id: str
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
|
||||
transformer: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||
channel: int
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: URL of the original post
|
||||
url: str
|
||||
|
||||
#: String that uniquely identifies the author of the post on the given platform, e.g. ``"-1001101170442"``.
|
||||
author_id: str
|
||||
|
||||
#: Username of author who made post.
|
||||
author_username: str
|
||||
|
||||
#: Text of the original post
|
||||
content: str
|
||||
|
||||
#: The ID of the Channel that the post was forwarded or quoted from
|
||||
forwarded_from: int = None
|
||||
|
||||
#: The ID of the Post that this Post is a reply to or reblog of
|
||||
reply_to: int = None
|
||||
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
@@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table)
|
||||
|
||||
@dataclass
|
||||
class Media:
|
||||
"""Base class for organizing information about a media file.
|
||||
"""
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||
raw_id: int
|
||||
|
||||
#: ID number of the media's corresponding post in the ``posts`` table.
|
||||
post: int
|
||||
|
||||
#: URL of the original post.
|
||||
url: str
|
||||
|
||||
#: Original URL of the media from the original post.
|
||||
original_url: str
|
||||
|
||||
#: JSON dump of the dict containing metadata information for the media file.
|
||||
exif: str = None
|
||||
|
||||
def get_blob(self):
|
||||
"""Download media file as bytes blob.
|
||||
"""
|
||||
|
||||
blob = make_request(self.url)
|
||||
return blob.content
|
||||
|
||||
def hydrate(self, blob = None):
|
||||
"""Download media file as bytes blob and extract data from content.
|
||||
"""
|
||||
|
||||
if blob is None:
|
||||
blob = self.get_blob()
|
||||
|
||||
self.hydrate_exif(blob)
|
||||
|
||||
def hydrate_exif(self, blob):
|
||||
f = open('tmp', 'wb')
|
||||
f.write(blob)
|
||||
f.close()
|
||||
"""Extract Exif metadata from bytes blob.
|
||||
"""
|
||||
|
||||
with exiftool.ExifTool() as et:
|
||||
exif = et.get_metadata('tmp')
|
||||
self.exif = json.dumps(exif)
|
||||
with tempfile.NamedTemporaryFile() as temp_file:
|
||||
temp_file.write(blob)
|
||||
|
||||
os.remove('tmp')
|
||||
with exiftool.ExifTool() as et:
|
||||
exif = et.get_metadata(temp_file.name)
|
||||
self.exif = json.dumps(exif)
|
||||
|
||||
@dataclass
|
||||
class Image(Media):
|
||||
"""Class for organizing information about an image file.
|
||||
"""
|
||||
|
||||
#: Extracted OCR content from image
|
||||
ocr: str = None
|
||||
|
||||
def hydrate(self, blob=None):
|
||||
"""Download image file as bytes blob and extract Exif and OCR content
|
||||
from the image.
|
||||
"""
|
||||
|
||||
if blob is None:
|
||||
blob = self.get_blob()
|
||||
|
||||
@@ -165,25 +253,62 @@ class Image(Media):
|
||||
self.hydrate_ocr(blob)
|
||||
|
||||
def hydrate_ocr(self, blob):
|
||||
"""Extract OCR (optical character recognition) data from image bytes blob.
|
||||
"""
|
||||
|
||||
image = PIL.Image.open(io.BytesIO(blob))
|
||||
self.ocr = pytesseract.image_to_string(image)
|
||||
|
||||
@dataclass
|
||||
class Video(Media):
|
||||
"""Class for organizing information about a video file.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
|
||||
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('url', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String),
|
||||
Column('content', String))
|
||||
|
||||
media_table = Table('media', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
Column('exif', String),
|
||||
Column('ocr', String)
|
||||
)
|
||||
Column('ocr', String))
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||
from a given channel on that specific platform.
|
||||
"""
|
||||
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
def __init__(self):
|
||||
self.s3_client = boto3.client('s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url='https://{}.digitaloceanspaces.com'.format(
|
||||
os.environ['DO_SPACES_REGION']),
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Initialize client to transfer files to the storage archive
|
||||
self.s3_client = boto3.client(
|
||||
service_name='s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Define request headers (necessary to bypass scraping protection
|
||||
# for several platform scrapers)
|
||||
self.headers = {
|
||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def get_username_from_url(self, url: str) -> str:
|
||||
"""Extract a channel's username from its URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of the channel on a given platform
|
||||
e.g. ``"https://twitter.com/EliotHiggins"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
username: str
|
||||
Extracted username of the channel.
|
||||
e.g. ``"EliotHiggins"``
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
"""Generate a unique identifier for media from a specified post.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
key: str
|
||||
Unique identifier for the media file from a specified post based on
|
||||
the original post URL and the media's Content-Type.
|
||||
"""
|
||||
|
||||
key = urlparse(url).path.split('/')[-1]
|
||||
return key
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified media file URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
r = make_request(url, headers = self.headers)
|
||||
|
||||
@@ -49,6 +111,27 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified media URL, where the media file
|
||||
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of m3u8 playlist file from original post.
|
||||
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
@@ -71,7 +154,28 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
"""Download media file from a specified media URL, using a fork of
|
||||
youtube-dl that enables faster downloading.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://rumble.com/embed/vgt7gh/"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
@@ -103,6 +207,23 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
"""Upload raw bytes of a media file to the storage archive.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob: bytes
|
||||
Raw bytes of the media file to be archived.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
archived_url: str
|
||||
URL specifying the file on the storage archive.
|
||||
"""
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
@@ -114,9 +235,42 @@ class Scraper:
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` if the scraper is capable of scraping ``channel``,
|
||||
``False`` if not.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
"""Scrape all posts from the specified Channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
since: ScraperResult or None
|
||||
Most recently scraped ScraperResult from a previous scrape, or
|
||||
``None`` if scraper has not run before.
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
|
||||
Yields
|
||||
------
|
||||
ScraperResult
|
||||
Scraper result from a single post/comment from the specified Channel.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -129,9 +283,13 @@ class ScraperController:
|
||||
self.session = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
"""Register a single Scraper instance to the controller.
|
||||
"""
|
||||
self.scrapers.append(scraper)
|
||||
|
||||
def register_scrapers(self, scraper: List[Scraper]):
|
||||
"""Register a list of Scraper instances to the controller.
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True):
|
||||
@@ -147,6 +305,17 @@ class ScraperController:
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
"""Scrape all posts for all specified channels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channels: list<Channel>
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -185,6 +354,9 @@ class ScraperController:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
# create tables
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
@@ -193,8 +365,8 @@ class ScraperController:
|
||||
self.session.configure(bind=self.engine)
|
||||
|
||||
def reset_db(self):
|
||||
"""Drop all data from the connected SQLAlchemy database.
|
||||
"""
|
||||
|
||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||
self.connect_to_db(self.engine)
|
||||
|
||||
|
||||
self.connect_to_db(self.engine)
|
||||
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
|
||||
library"""
|
||||
__version__ = "BitchuteScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
||||
|
||||
return username
|
||||
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
detail = 'comments'
|
||||
|
||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||
|
||||
for post in scraper:
|
||||
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -11,14 +11,14 @@ class GabScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
__version__ = "GabScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://gab.com/')[-1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Garc(profile = 'main')
|
||||
username = GabScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
scraper = client.userposts(username)
|
||||
|
||||
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split("gettr.com/user/")[1]
|
||||
if len(username.split("/")) > 1:
|
||||
return None
|
||||
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
|
||||
for post in scraper:
|
||||
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -18,6 +18,7 @@ CONTENT_TYPES = {
|
||||
'mp4' : 'video/mp4'}
|
||||
|
||||
class InstagramScraper(Scraper):
|
||||
"""An implementation of a Scraper for Instagram, using instaloader library"""
|
||||
__version__ = "InstagramScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
__version__ = "OdyseeScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
|
||||
|
||||
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username)
|
||||
|
||||
all_videos = odysee_channel.get_all_videos()
|
||||
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
|
||||
archived_urls={})
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://rumble.com/c/')[1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = RumbleScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_channel_videos(username)
|
||||
|
||||
for post in scraper:
|
||||
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
|
||||
return key
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using snscrape library"""
|
||||
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
||||
|
||||
def can_handle(self, channel):
|
||||
|
||||
@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
|
||||
class TelegramTelethonScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID_1']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH_1']
|
||||
phone = os.environ['TELEGRAM_PHONE_1']
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
phone = os.environ['TELEGRAM_PHONE']
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
|
||||
|
||||
Reference in New Issue
Block a user