mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 21:08:34 +03:00
Merge pull request #17 from bellingcat/channel-db
Add Channel object to ORM, store in DB
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -9,10 +9,12 @@ docs/source/_*
|
||||
*.db
|
||||
.env
|
||||
*.session
|
||||
service_account.json
|
||||
.vscode/
|
||||
|
||||
# Unit test / coverage reports
|
||||
reports
|
||||
.coverage
|
||||
.cache
|
||||
.pytest_cache/
|
||||
cover/
|
||||
cover/
|
||||
|
||||
5
Pipfile
5
Pipfile
@@ -20,14 +20,19 @@ telethon = "*"
|
||||
pytesseract = "*"
|
||||
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
|
||||
instaloader = "*"
|
||||
gspread = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
pytest-cov = "*"
|
||||
pytest-html = "*"
|
||||
pytest-metadata = "*"
|
||||
black = "*"
|
||||
sphinx = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
@@ -6,7 +6,7 @@ import json
|
||||
import io
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
||||
import pytesseract
|
||||
import PIL
|
||||
import exiftool
|
||||
@@ -24,8 +24,7 @@ class ScraperResult:
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#TODO there is probably a way of making this a Channel object foreign key
|
||||
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||
#: Foreign key of channel ID that this was scraped from
|
||||
channel: int
|
||||
|
||||
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
|
||||
@@ -42,27 +41,33 @@ class ScraperResult:
|
||||
|
||||
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||
archived_urls: dict
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
"""Information about a specific channel to be scraped.
|
||||
"""
|
||||
|
||||
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||
id: int
|
||||
|
||||
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
|
||||
name: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
platform_id: str
|
||||
|
||||
#: User-specified category for the channel, e.g. ``"qanon-adjacent"``.
|
||||
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
|
||||
category: str
|
||||
|
||||
#: Number of followers the channel has on the given platform, e.g. ``"1465"``.
|
||||
followers: int
|
||||
|
||||
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
|
||||
platform: str
|
||||
|
||||
@@ -71,28 +76,55 @@ class Channel:
|
||||
|
||||
#: Screen name/username of channel.
|
||||
screenname: str
|
||||
|
||||
|
||||
#: Two-letter country code for the channel's country of origin, e.g. ``"RU"``.
|
||||
country: str
|
||||
|
||||
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
|
||||
influencer: str
|
||||
|
||||
country: str = None
|
||||
|
||||
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
|
||||
influencer: str = None
|
||||
|
||||
#: Whether or not the channel is publicly-accessible.
|
||||
public: bool
|
||||
|
||||
public: bool = None
|
||||
|
||||
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
|
||||
chat: bool
|
||||
|
||||
chat: bool = None
|
||||
|
||||
#: Any other additional notes about the channel.
|
||||
notes: str
|
||||
notes: str = ""
|
||||
|
||||
#: Did the channel come from a researcher or a scraping process?
|
||||
source: str = None
|
||||
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
channel_table = Table('channels', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('name', String),
|
||||
Column('platform_id', Integer),
|
||||
Column('category', String),
|
||||
Column('platform', String),
|
||||
Column('url', String),
|
||||
Column('screenname', String),
|
||||
Column('country', String),
|
||||
Column('influencer', String),
|
||||
Column('public', Boolean),
|
||||
Column('chat', Boolean),
|
||||
Column('notes', String),
|
||||
Column('source', String)
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
|
||||
@dataclass
|
||||
class TransformedResult:
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
#: ID number of the scraped post in the ``raw_data`` table
|
||||
raw_id: int
|
||||
|
||||
#: Platform specific post ID
|
||||
platform_id: str
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
@@ -111,19 +143,49 @@ class TransformedResult:
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
|
||||
#: URL of the original post
|
||||
url: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
author_id: str
|
||||
|
||||
|
||||
#: Username of author who made post.
|
||||
author_username: str
|
||||
|
||||
|
||||
#: Text of the original post
|
||||
content: str
|
||||
|
||||
#: The ID of the Channel that the post was forwarded or quoted from
|
||||
forwarded_from: int = None
|
||||
|
||||
#: The ID of the Post that this Post is a reply to or reblog of
|
||||
reply_to: int = None
|
||||
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('platform_id', Integer),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('url', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String),
|
||||
Column('content', String),
|
||||
Column('forwarded_from', Integer, ForeignKey('channels.id')),
|
||||
Column('reply_to', Integer, ForeignKey('posts.id'))
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(Post, post_table)
|
||||
|
||||
@dataclass
|
||||
class Media:
|
||||
"""Base class for organizing information about a media file.
|
||||
@@ -239,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('post', Integer, ForeignKey('analysis.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
Column('exif', String),
|
||||
|
||||
@@ -291,6 +291,17 @@ class ScraperController:
|
||||
"""Register a list of Scraper instances to the controller.
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media)
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
@@ -314,7 +325,6 @@ class ScraperController:
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.can_handle(channel):
|
||||
session = self.session()
|
||||
handled = True
|
||||
added = 0
|
||||
|
||||
|
||||
@@ -29,25 +29,33 @@ class TwitterScraper(Scraper):
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
|
||||
media_list = []
|
||||
if tweet.media:
|
||||
for media in tweet.media:
|
||||
if type(media) == Video:
|
||||
variant = max(
|
||||
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
|
||||
url = variant.url
|
||||
elif type(media) == Gif:
|
||||
url = media.variants[0].url
|
||||
elif type(media) == Photo:
|
||||
url = media.fullUrl
|
||||
else:
|
||||
logger.warning(f"Could not get media URL of {media}")
|
||||
url = None
|
||||
media_list += tweet.media
|
||||
|
||||
if url is not None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
if tweet.retweetedTweet and tweet.retweetedTweet.media:
|
||||
media_list += tweet.retweetedTweet.media
|
||||
|
||||
if tweet.quotedTweet and tweet.quotedTweet.media:
|
||||
media_list += tweet.quotedTweet.media
|
||||
|
||||
for media in media_list:
|
||||
if type(media) == Video:
|
||||
variant = max(
|
||||
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
|
||||
url = variant.url
|
||||
elif type(media) == Gif:
|
||||
url = media.variants[0].url
|
||||
elif type(media) == Photo:
|
||||
url = media.fullUrl
|
||||
else:
|
||||
logger.warning(f"Could not get media URL of {media}")
|
||||
url = None
|
||||
|
||||
if url is not None and url not in archived_urls:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from typing import List, Generator
|
||||
from typing import List, Generator, Union, Callable
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.orm import sessionmaker, make_transient
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from collections import defaultdict
|
||||
|
||||
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
|
||||
|
||||
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
|
||||
|
||||
class Transformer:
|
||||
"""Interface class for transformers."""
|
||||
@@ -16,12 +18,12 @@ class Transformer:
|
||||
def can_handle(data: ScraperResult) -> bool:
|
||||
"""Specifies whether or not a Transformer is capable of handling a particular
|
||||
piece of scraped data.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ScraperResult
|
||||
The ScraperResult object to check for ability to handle.
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
@@ -30,39 +32,18 @@ class Transformer:
|
||||
|
||||
pass
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
|
||||
"""Yields Media objects from each piece of media present in a raw ScraperResult.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ScraperResult
|
||||
The ScraperResult object to process
|
||||
transformed : TransformedResult
|
||||
The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`)
|
||||
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
||||
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
|
||||
objects and provide all of these to be inserted into the database.
|
||||
|
||||
Yields
|
||||
------
|
||||
Media
|
||||
A media object generated from the ScraperResult. One ScraperResult can have multiple pieces
|
||||
of media contained within it, so this can generate an arbitrary number of Media objects
|
||||
(or their subclasses.) These Media objects are not fully hydrated.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
def transform(data: ScraperResult) -> TransformedResult:
|
||||
"""Transform a ScraperResult into a TransformedResult object. This extracts additional attributes
|
||||
that can be used directly for analysis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ScraperResult
|
||||
The ScraperResult object to process.
|
||||
|
||||
Returns
|
||||
-------
|
||||
TransformedResult
|
||||
A TransformedResult representation of the `data` object.
|
||||
insert : Callable
|
||||
A function that either inserts the object into a database or finds an object with the
|
||||
relevant unique constraints if applicable.
|
||||
"""
|
||||
|
||||
pass
|
||||
@@ -78,7 +59,7 @@ class ETLController:
|
||||
|
||||
def register_transformer(self, transformer: Transformer):
|
||||
"""Adds a Transformer to the list of available Transformers.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transformer : Transformer
|
||||
@@ -89,7 +70,7 @@ class ETLController:
|
||||
|
||||
def connect_to_db(self, engine: Engine):
|
||||
"""Connects the ETLController to a SQLAlchemy engine.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
engine : Engine
|
||||
@@ -101,11 +82,59 @@ class ETLController:
|
||||
self.session = sessionmaker()
|
||||
self.session.configure(bind=engine)
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def insert_or_select(self, obj, session, hydrate: bool = True):
|
||||
"""Inserts an object into the database or returns an existing object from the database.
|
||||
Regardless, the resulting object has an `id` attribute that can be referenced later."""
|
||||
|
||||
instance = None
|
||||
|
||||
# This uses some ad hoc unique constraints that might be worth formalizing at some point
|
||||
if type(obj) == Channel:
|
||||
instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
|
||||
|
||||
elif type(obj) == Post:
|
||||
instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
||||
|
||||
elif issubclass(type(obj), Media):
|
||||
instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first()
|
||||
if instance:
|
||||
logger.info(f"Found matching DB entry for {obj}: {instance}")
|
||||
return instance
|
||||
|
||||
instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first()
|
||||
|
||||
# For Media objects we want to duplicate the entry to preserve the relationship with the post.
|
||||
# However, we also want to avoid rehydration, hence the code below:
|
||||
if instance:
|
||||
logger.info(f"Found matching media record, duplicating and inserting for new post")
|
||||
|
||||
session.expunge(instance)
|
||||
make_transient(instance)
|
||||
instance.id = None
|
||||
instance.post = obj.post
|
||||
instance.raw_id = obj.raw_id
|
||||
|
||||
session.add(instance)
|
||||
session.flush()
|
||||
return instance
|
||||
|
||||
if instance:
|
||||
logger.info(f"Found matching DB entry for {obj}: {instance}")
|
||||
return instance
|
||||
|
||||
if hydrate:
|
||||
obj.hydrate()
|
||||
|
||||
logger.info(f"Inserting new object {obj}")
|
||||
session.add(obj)
|
||||
session.flush()
|
||||
return obj
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
|
||||
"""Transforms raw ScraperResults objects into TransformedResult objects and
|
||||
"""Transforms raw ScraperResults objects into Post objects and
|
||||
Media objects. Then, adds them to the database.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
results : List[ScraperResult]
|
||||
@@ -126,34 +155,18 @@ class ETLController:
|
||||
handled = True
|
||||
session = self.session()
|
||||
|
||||
transformed = transformer.transform(result)
|
||||
|
||||
session.add(transformed)
|
||||
session.flush()
|
||||
|
||||
media = transformer.transform_media(result, transformed)
|
||||
|
||||
count = 0
|
||||
for obj in media:
|
||||
if hydrate:
|
||||
logger.info(f"Hydrating {obj}")
|
||||
obj.hydrate()
|
||||
|
||||
session.add(obj)
|
||||
count += 1
|
||||
|
||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate))
|
||||
session.commit()
|
||||
logger.info(f"{transformer} generated {count} media objects")
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle {result}")
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
"""Transform all ScraperResult objects in the database that do not have an
|
||||
equivalent TransformedResult object stored.
|
||||
|
||||
equivalent Post object stored.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
hydrate : bool
|
||||
@@ -165,7 +178,12 @@ class ETLController:
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all()
|
||||
untransformed = (
|
||||
session.query(ScraperResult)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.all()
|
||||
)
|
||||
logger.info(f"Found {len(untransformed)} items to ETL")
|
||||
|
||||
self.transform_results(untransformed, hydrate=hydrate)
|
||||
self.transform_results(untransformed, hydrate=hydrate)
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Generator
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media
|
||||
|
||||
class BitchuteTransformer(Transformer):
|
||||
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
@@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
|
||||
transformed = TransformedResult(
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -17,11 +18,9 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
if raw['media']:
|
||||
for media in raw['media']:
|
||||
def process_media(self, tweet, post_id, data):
|
||||
if tweet['media']:
|
||||
for media in tweet['media']:
|
||||
orig = None
|
||||
|
||||
if media["_type"] == "snscrape.modules.twitter.Photo":
|
||||
@@ -40,26 +39,77 @@ class TwitterTransformer(Transformer):
|
||||
new = data.archived_urls[orig]
|
||||
|
||||
if media["_type"] == "snscrape.modules.twitter.Photo":
|
||||
m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
|
||||
else:
|
||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig)
|
||||
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = TransformedResult(
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=raw['id'],
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
platform=data.platform,
|
||||
channel=data.channel,
|
||||
date=data.date,
|
||||
date=dateutil.parser.parse(raw['date']),
|
||||
date_archived=data.date_archived,
|
||||
url=raw['url'],
|
||||
content=raw['content'],
|
||||
author_id=raw['user']['id'],
|
||||
author_username=raw['user']['username'])
|
||||
|
||||
return transformed
|
||||
def subtweet(tweet):
|
||||
channel = Channel(
|
||||
name=tweet['user']['displayname'],
|
||||
platform_id=tweet['user']['id'],
|
||||
platform=data.platform,
|
||||
url=tweet['user']['url'],
|
||||
screenname=tweet['user']['username'],
|
||||
category='forwarded',
|
||||
source=self.__version__
|
||||
)
|
||||
|
||||
channel = insert(channel)
|
||||
|
||||
original = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=tweet['id'],
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
platform=data.platform,
|
||||
channel=channel.id,
|
||||
date=dateutil.parser.parse(tweet['date']),
|
||||
date_archived=data.date_archived,
|
||||
url=tweet['url'],
|
||||
content=tweet['content'],
|
||||
author_id=tweet['user']['id'],
|
||||
author_username=tweet['user']['username']
|
||||
)
|
||||
|
||||
original = insert(original)
|
||||
transformed.forwarded_from = channel.id
|
||||
transformed.reply_to = original.id
|
||||
|
||||
media = self.process_media(tweet, original.id, data)
|
||||
for m in media:
|
||||
insert(m)
|
||||
|
||||
if raw['retweetedTweet'] is not None:
|
||||
subtweet(raw['retweetedTweet'])
|
||||
|
||||
if raw['quotedTweet'] is not None:
|
||||
subtweet(raw['quotedTweet'])
|
||||
|
||||
insert(transformed)
|
||||
|
||||
media = self.process_media(raw, transformed.id, data)
|
||||
for m in media:
|
||||
insert(m)
|
||||
|
||||
|
||||
|
||||
|
||||
52
test.py
52
test.py
@@ -1,7 +1,10 @@
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
import gspread
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, TransformedResult, ScraperResult
|
||||
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
@@ -14,26 +17,9 @@ from cisticola.scraper import (
|
||||
TwitterScraper)
|
||||
from cisticola.transformer import ETLController
|
||||
from cisticola.transformer.twitter import TwitterTransformer
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
logger.add("../test.log")
|
||||
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="L Weber (test)",
|
||||
platform_id=1424979017749442595,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Twitter",
|
||||
url="https://twitter.com/LWeber33662141",
|
||||
screenname="LWeber33662141",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes="")]
|
||||
|
||||
controller = ScraperController()
|
||||
|
||||
scrapers = [
|
||||
@@ -49,9 +35,35 @@ scrapers = [
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
controller.connect_to_db(engine)
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
session_generator = sessionmaker()
|
||||
session_generator.configure(bind=engine)
|
||||
session = session_generator()
|
||||
|
||||
controller.scrape_channels(test_channels, archive_media = True)
|
||||
gc = gspread.service_account(filename='service_account.json')
|
||||
|
||||
# Open a sheet from a spreadsheet in one go
|
||||
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0")
|
||||
channels = wks.worksheet("channels").get_all_records()
|
||||
|
||||
for c in channels:
|
||||
del c['followers']
|
||||
|
||||
for k in c.keys():
|
||||
if c[k] == 'TRUE': c[k] = True
|
||||
if c[k] == 'FALSE': c[k] = False
|
||||
|
||||
# Check whether this channel already exists before adding it
|
||||
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
session.add(channel)
|
||||
|
||||
session.commit()
|
||||
|
||||
controller.connect_to_db(engine)
|
||||
controller.scrape_all_channels(archive_media = True)
|
||||
|
||||
transformer = TwitterTransformer()
|
||||
|
||||
|
||||
@@ -8,11 +8,9 @@ from cisticola.transformer import ETLController
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
BITCHUTE_CHANNEL_KWARGS = {
|
||||
'id': 0,
|
||||
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
|
||||
'platform_id': 'bestonlinejewelrystoresusagmailcom',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Bitchute',
|
||||
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
|
||||
'screenname': None,
|
||||
@@ -23,11 +21,9 @@ BITCHUTE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
GAB_CHANNEL_KWARGS = {
|
||||
'id': 1,
|
||||
'name': 'Capt. Marc Simon (test)',
|
||||
'platform_id': 'marc_capt',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Gab',
|
||||
'url': 'https://gab.com/marc_capt',
|
||||
'screenname': 'marc_capt',
|
||||
@@ -38,11 +34,9 @@ GAB_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
GETTR_CHANNEL_KWARGS = {
|
||||
'id': 2,
|
||||
'name': 'LizardRepublic (test)',
|
||||
'platform_id': 'lizardrepublic',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Gettr',
|
||||
'url': 'https://www.gettr.com/user/lizardrepublic',
|
||||
'screenname': 'lizardrepublic',
|
||||
@@ -53,11 +47,9 @@ GETTR_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
INSTAGRAM_CHANNEL_KWARGS = {
|
||||
'id': 3,
|
||||
'name': 'borland.88 (test)',
|
||||
'platform_id': 'borland.88',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Instagram',
|
||||
'url': 'https://www.instagram.com/borland.88/',
|
||||
'screenname': 'borland.88',
|
||||
@@ -68,11 +60,9 @@ INSTAGRAM_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
ODYSEE_CHANNEL_KWARGS = {
|
||||
'id': 4,
|
||||
'name': "Mak1n' Bacon (test)",
|
||||
'platform_id': 'Mak1nBacon',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Odysee',
|
||||
'url': 'https://odysee.com/@Mak1nBacon',
|
||||
'screenname': 'Mak1nBacon',
|
||||
@@ -83,11 +73,9 @@ ODYSEE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
RUMBLE_CHANNEL_KWARGS = {
|
||||
'id': 5,
|
||||
'name': 'we are uploading videos wow products',
|
||||
'platform_id': 'c-916305',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Rumble',
|
||||
'url': 'https://rumble.com/c/c-916305',
|
||||
'screenname': 'we are uploading',
|
||||
@@ -98,11 +86,9 @@ RUMBLE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
TELEGRAM_CHANNEL_KWARGS = {
|
||||
'id': 6,
|
||||
'name': 'South West Ohio Proud Boys (test)',
|
||||
'platform_id': -1001276612436,
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Telegram',
|
||||
'url': 'https://t.me/SouthwestOhioPB',
|
||||
'screenname': 'SouthwestOhioPB',
|
||||
@@ -113,11 +99,9 @@ TELEGRAM_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
TWITTER_CHANNEL_KWARGS = {
|
||||
'id': 7,
|
||||
'name': 'L Weber (test)',
|
||||
'platform_id': 1424979017749442595,
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Twitter',
|
||||
'url': 'https://twitter.com/LWeber33662141',
|
||||
'screenname': 'LWeber33662141',
|
||||
@@ -128,11 +112,9 @@ TWITTER_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
VKONTAKTE_CHANNEL_KWARGS = {
|
||||
'id': 8,
|
||||
'name': 'Wwg1wgA (test)',
|
||||
'platform_id': 'club201278078',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Vkontakte',
|
||||
'url': 'https://vk.com/club201278078',
|
||||
'screenname': 'Wwg1wgA',
|
||||
@@ -143,11 +125,9 @@ VKONTAKTE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
YOUTUBE_CHANNEL_KWARGS = {
|
||||
'id': 9,
|
||||
'name': 'AnEs87 (test)',
|
||||
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Youtube',
|
||||
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
|
||||
'screenname': 'AnEs87',
|
||||
|
||||
@@ -4,7 +4,7 @@ import json
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
from cisticola.transformer import TwitterTransformer
|
||||
from cisticola.base import TransformedResult, Media
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
controller.reset_db()
|
||||
@@ -20,11 +20,11 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
sessionfactory.configure(bind=engine)
|
||||
session = sessionfactory()
|
||||
|
||||
posts = session.query(TransformedResult).all()
|
||||
posts = session.query(Post).all()
|
||||
media = session.query(Media).all()
|
||||
|
||||
assert len(posts) == 3
|
||||
assert len(media) == 2
|
||||
assert len(posts) == 10
|
||||
assert len(media) == 7
|
||||
|
||||
assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
|
||||
assert posts[-1].content == "BARN"
|
||||
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user