merged main

This commit is contained in:
Tristan Lee
2022-03-14 18:19:57 -05:00
25 changed files with 512 additions and 134 deletions

View File

@@ -17,6 +17,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
telethon = "*"
pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
[dev-packages]
pytest = "*"

67
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "495ba305ca55a0ac5754037ba133518b47324965dd3ab0b8db8b69206524d68e"
"sha256": "c29fb4651dfcf05e182e5cc94323c9a6aedf2a821cd57ea17b1b48f707283646"
},
"pipfile-spec": 6,
"requires": {
@@ -42,11 +42,11 @@
},
"botocore": {
"hashes": [
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
"sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
"sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.18"
"version": "==1.24.19"
},
"bs4": {
"hashes": [
@@ -344,6 +344,47 @@
"markers": "python_version >= '3.8'",
"version": "==1.4.1"
},
"pillow": {
"hashes": [
"sha256:011233e0c42a4a7836498e98c1acf5e744c96a67dd5032a6f666cc1fb97eab97",
"sha256:0f29d831e2151e0b7b39981756d201f7108d3d215896212ffe2e992d06bfe049",
"sha256:12875d118f21cf35604176872447cdb57b07126750a33748bac15e77f90f1f9c",
"sha256:14d4b1341ac07ae07eb2cc682f459bec932a380c3b122f5540432d8977e64eae",
"sha256:1c3c33ac69cf059bbb9d1a71eeaba76781b450bc307e2291f8a4764d779a6b28",
"sha256:1d19397351f73a88904ad1aee421e800fe4bbcd1aeee6435fb62d0a05ccd1030",
"sha256:253e8a302a96df6927310a9d44e6103055e8fb96a6822f8b7f514bb7ef77de56",
"sha256:2632d0f846b7c7600edf53c48f8f9f1e13e62f66a6dbc15191029d950bfed976",
"sha256:335ace1a22325395c4ea88e00ba3dc89ca029bd66bd5a3c382d53e44f0ccd77e",
"sha256:413ce0bbf9fc6278b2d63309dfeefe452835e1c78398efb431bab0672fe9274e",
"sha256:5100b45a4638e3c00e4d2320d3193bdabb2d75e79793af7c3eb139e4f569f16f",
"sha256:514ceac913076feefbeaf89771fd6febde78b0c4c1b23aaeab082c41c694e81b",
"sha256:528a2a692c65dd5cafc130de286030af251d2ee0483a5bf50c9348aefe834e8a",
"sha256:6295f6763749b89c994fcb6d8a7f7ce03c3992e695f89f00b741b4580b199b7e",
"sha256:6c8bc8238a7dfdaf7a75f5ec5a663f4173f8c367e5a39f87e720495e1eed75fa",
"sha256:718856856ba31f14f13ba885ff13874be7fefc53984d2832458f12c38205f7f7",
"sha256:7f7609a718b177bf171ac93cea9fd2ddc0e03e84d8fa4e887bdfc39671d46b00",
"sha256:80ca33961ced9c63358056bd08403ff866512038883e74f3a4bf88ad3eb66838",
"sha256:80fe64a6deb6fcfdf7b8386f2cf216d329be6f2781f7d90304351811fb591360",
"sha256:81c4b81611e3a3cb30e59b0cf05b888c675f97e3adb2c8672c3154047980726b",
"sha256:855c583f268edde09474b081e3ddcd5cf3b20c12f26e0d434e1386cc5d318e7a",
"sha256:9bfdb82cdfeccec50aad441afc332faf8606dfa5e8efd18a6692b5d6e79f00fd",
"sha256:a5d24e1d674dd9d72c66ad3ea9131322819ff86250b30dc5821cbafcfa0b96b4",
"sha256:a9f44cd7e162ac6191491d7249cceb02b8116b0f7e847ee33f739d7cb1ea1f70",
"sha256:b5b3f092fe345c03bca1e0b687dfbb39364b21ebb8ba90e3fa707374b7915204",
"sha256:b9618823bd237c0d2575283f2939655f54d51b4527ec3972907a927acbcc5bfc",
"sha256:cef9c85ccbe9bee00909758936ea841ef12035296c748aaceee535969e27d31b",
"sha256:d21237d0cd37acded35154e29aec853e945950321dd2ffd1a7d86fe686814669",
"sha256:d3c5c79ab7dfce6d88f1ba639b77e77a17ea33a01b07b99840d6ed08031cb2a7",
"sha256:d9d7942b624b04b895cb95af03a23407f17646815495ce4547f0e60e0b06f58e",
"sha256:db6d9fac65bd08cea7f3540b899977c6dee9edad959fa4eaf305940d9cbd861c",
"sha256:ede5af4a2702444a832a800b8eb7f0a7a1c0eed55b644642e049c98d589e5092",
"sha256:effb7749713d5317478bb3acb3f81d9d7c7f86726d41c1facca068a04cf5bb4c",
"sha256:f154d173286a5d1863637a7dcd8c3437bb557520b01bddb0be0258dcb72696b5",
"sha256:f25ed6e28ddf50de7e7ea99d7a976d6a9c415f03adcaac9c41ff6ff41b6d86ac"
],
"markers": "python_version >= '3.7'",
"version": "==9.0.1"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
@@ -388,6 +429,10 @@
],
"version": "==0.4.8"
},
"pyexiftool": {
"git": "https://github.com/smarnach/pyexiftool.git",
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -404,6 +449,14 @@
],
"version": "==1.7.1"
},
"pytesseract": {
"hashes": [
"sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0",
"sha256:fecda37d1e4eaf744c657cd03a5daab4eb97c61506ac5550274322c8ae32eca2"
],
"index": "pypi",
"version": "==0.3.9"
},
"pytest": {
"hashes": [
"sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
@@ -528,7 +581,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"markers": "python_version >= '3.6' and python_version < '4.0'",
"version": "==4.8"
},
"s3transfer": {
@@ -637,7 +690,7 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
},
"youtube-dl": {
@@ -1002,7 +1055,7 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
},
"zipp": {

View File

@@ -1,9 +1,16 @@
from typing import List
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
import pytesseract
import PIL
import io
import exiftool
import json
import os
from .utils import make_request
@dataclass
class ScraperResult:
@@ -96,7 +103,7 @@ class TransformedResult:
platform: str
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: str
channel: int
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
@@ -107,15 +114,16 @@ class TransformedResult:
#: URL of the original post
url: str
#: Text of the original post
content: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
#: Text of the original post
content: str
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
@@ -139,13 +147,78 @@ analysis_table = Table('analysis', mapper_registry.metadata,
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', String),
Column('channel', Integer),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('content', String),
Column('author_id', String),
Column('author_username', String)
Column('author_username', String),
Column('content', String)
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
@dataclass
class Media:
    """A piece of media attached to a scraped post.

    Instances are created with only their identifiers and URLs set. Calling
    :meth:`hydrate` downloads the file and fills in the derived fields.
    """

    #: ``id`` of the ``raw_data`` row this media was extracted from.
    raw_id: int

    #: ``id`` of the ``analysis`` row (transformed post) this media belongs to.
    post: int

    #: URL the media is downloaded from when hydrating.
    url: str

    #: URL of the media as it appeared in the scraped post.
    original_url: str

    #: JSON-encoded metadata reported by ExifTool; ``None`` until hydrated.
    exif: str = None

    def get_blob(self):
        """Download the media at ``self.url`` and return the response content."""
        response = make_request(self.url)
        return response.content

    def hydrate(self, blob=None):
        """Fill in all derived fields of this object.

        Parameters
        ----------
        blob
            Content of the media file. When ``None`` it is downloaded with
            :meth:`get_blob`.
        """
        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Run ExifTool on ``blob`` and store the result in ``self.exif`` as JSON.

        ExifTool is given a path, so the blob is written to a temporary file
        first. A uniquely named file is used (rather than a fixed ``tmp`` in
        the working directory) so that parallel runs cannot overwrite each
        other's data, and it is removed even when ExifTool fails.

        Parameters
        ----------
        blob
            Content of the media file.
        """
        import tempfile

        fd, path = tempfile.mkstemp()
        try:
            with os.fdopen(fd, 'wb') as f:
                f.write(blob)

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(path)

            self.exif = json.dumps(exif)
        finally:
            os.remove(path)
@dataclass
class Image(Media):
    """Media subclass for still images; adds text extracted by OCR."""

    #: Text found in the image by Tesseract; ``None`` until hydrated.
    ocr: str = None

    def hydrate(self, blob=None):
        """Fill in the fields inherited from ``Media`` and then the OCR text.

        Parameters
        ----------
        blob
            Content of the image file. When ``None`` it is downloaded once
            with ``get_blob()`` and reused for every hydration step.
        """
        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Run Tesseract OCR on ``blob`` and store the text in ``self.ocr``.

        Parameters
        ----------
        blob
            Content of the image file.
        """
        # Import the submodule explicitly: a bare ``import PIL`` does not
        # guarantee that ``PIL.Image`` is loaded. The alias avoids clashing
        # with this class's own name.
        from PIL import Image as PILImage

        with PILImage.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
    """Media subclass for videos; adds no fields or behaviour of its own."""
# Table holding every Media object (and subclass) produced by the transformers.
media_table = Table(
    'media',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    # Discriminator column used by the polymorphic mappers.
    Column('type', String),
    # Row of the raw scraped post the media was found in.
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    # Row of the transformed post the media belongs to.
    Column('post', Integer, ForeignKey('analysis.id')),
    Column('url', String),
    Column('original_url', String),
    Column('exif', String),
    Column('ocr', String)
)
# Media, Image and Video are all mapped onto media_table. The 'type' column
# tells them apart and holds 'media', 'image' or 'video' respectively.
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -1,4 +1,4 @@
from .utils import make_request
from cisticola.utils import make_request
from .base import Scraper, ScraperController
from .bitchute import BitchuteScraper
from .gab import GabScraper

View File

@@ -10,7 +10,7 @@ import ffmpeg
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.scraper import make_request
from cisticola.utils import make_request
class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
@@ -204,7 +204,6 @@ class ScraperController:
def __init__(self):
self.scrapers = []
self.session = None
self.mapper_registry = None
def register_scraper(self, scraper: Scraper):
"""Register a single Scraper instance to the controller.
@@ -275,4 +274,12 @@ class ScraperController:
mapper_registry.metadata.create_all(bind=engine)
self.session = sessionmaker()
self.session.configure(bind=engine)
self.engine = engine
self.session.configure(bind=self.engine)
def reset_db(self):
    """Drop every table registered on ``mapper_registry``, then reconnect
    to the same engine.
    """
    engine = self.engine
    mapper_registry.metadata.drop_all(bind=engine)
    self.connect_to_db(engine)

View File

@@ -236,8 +236,8 @@ def append_details(video, detail):
video["video_url"] = soup.select_one("video#player source").get("src")
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
video["subject"] = soup.select_one("h1#video-title").text
video["author"] = soup.select_one("div.channel-banner p.name a").text
video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
# we need *two more requests* to get the comment count and like/dislike counts

View File

@@ -67,9 +67,13 @@ class TwitterScraper(Scraper):
parsed_url = urlparse(url)
queries = parse_qs(parsed_url.query)
ext = ''
# TODO might require additional statements for other media formats
if 'jpg' in queries.get('format', []):
ext = '.jpg'
elif 'png' in queries.get('format', []):
ext = '.png'
elif parsed_url.path.endswith('.mp4'):
ext = ''

View File

@@ -1,2 +1,3 @@
from . import base
from .twitter import TwitterTransformer
from .base import ETLController
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer

View File

@@ -1,7 +1,12 @@
from cisticola.base import ScraperResult, TransformedResult
from typing import List, Generator
from loguru import logger
from sqlalchemy.orm import sessionmaker
from sqlalchemy.engine.base import Engine
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
class Transformer:
"""Interface class for transformers"""
"""Interface class for transformers."""
__version__ = "Transformer 0.0.0"
@@ -9,8 +14,158 @@ class Transformer:
pass
def can_handle(data: ScraperResult) -> bool:
"""Specifies whether or not a Transformer is capable of handling a particular
piece of scraped data.
Parameters
----------
data : ScraperResult
The ScraperResult object to check for ability to handle.
Returns
-------
bool
True if it can be handled by this Transformer, false otherwise.
"""
pass
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
"""Yields Media objects from each piece of media present in a raw ScraperResult.
Parameters
----------
data : ScraperResult
The ScraperResult object to process
transformed : TransformedResult
The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`)
Yields
------
Media
A media object generated from the ScraperResult. One ScraperResult can have multiple pieces
of media contained within it, so this can generate an arbitrary number of Media objects
(or their subclasses.) These Media objects are not fully hydrated.
"""
pass
def transform(data: ScraperResult) -> TransformedResult:
"""Transform a ScraperResult into a TransformedResult object. This extracts additional attributes
that can be used directly for analysis.
Parameters
----------
data : ScraperResult
The ScraperResult object to process.
Returns
-------
TransformedResult
A TransformedResult representation of the `data` object.
"""
pass
class ETLController:
    """An ETLController will transform raw scraped data (ScraperResult objects) into a more detailed format
    for analysis by using Transformer objects that have been registered with the controller.
    """

    def __init__(self):
        self.transformers = []

        # Session factory, set by connect_to_db(). Initialised here so the
        # "No DB session" guards work when no database has been connected.
        self.session = None

    def register_transformer(self, transformer: Transformer):
        """Adds a Transformer to the list of available Transformers.

        Parameters
        ----------
        transformer : Transformer
            The Transformer to register
        """
        self.transformers.append(transformer)

    def connect_to_db(self, engine: Engine):
        """Connects the ETLController to a SQLAlchemy engine.

        Parameters
        ----------
        engine : Engine
            SQLAlchemy Engine object
        """
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)

    @logger.catch
    def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
        """Transforms raw ScraperResults objects into TransformedResult objects and
        Media objects. Then, adds them to the database.

        Each result is handled by the first registered Transformer whose
        ``can_handle()`` returns True, and is committed in its own session.

        Parameters
        ----------
        results : List[ScraperResult]
            A list of ScraperResult objects to be transformed
        hydrate : bool
            Whether or not to fully hydrate transformed media. Default True.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for result in results:
            # Reset per result, before the transformer loop, so the warning
            # below is also correct when no transformers are registered.
            handled = False

            for transformer in self.transformers:
                if not transformer.can_handle(result):
                    continue

                logger.info(f"{transformer} is handling result {result}")
                handled = True

                session = self.session()
                try:
                    transformed = transformer.transform(result)
                    session.add(transformed)
                    # flush so that `transformed.id` is populated before the
                    # media objects that reference it are built
                    session.flush()

                    count = 0
                    for obj in transformer.transform_media(result, transformed):
                        if hydrate:
                            logger.info(f"Hydrating {obj}")
                            obj.hydrate()

                        session.add(obj)
                        count += 1

                    session.commit()
                finally:
                    # Release the connection whether or not the commit was
                    # reached; closing discards any uncommitted work.
                    session.close()

                logger.info(f"{transformer} generated {count} media objects")
                break

            if not handled:
                logger.warning(f"No Transformer could handle {result}")

    @logger.catch
    def transform_all_untransformed(self, hydrate: bool = True):
        """Transform all ScraperResult objects in the database that do not have an
        equivalent TransformedResult object stored.

        Parameters
        ----------
        hydrate : bool
            Whether or not to fully hydrate transformed media. Default True.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        # NOTE(review): this session is never closed. Closing it would detach
        # the objects handed to transform_results() -- confirm before changing.
        session = self.session()

        # `== None` is intentional: SQLAlchemy turns it into an SQL `IS NULL`
        # test, selecting raw rows with no matching transformed row.
        untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all()  # noqa: E711

        logger.info(f"Found {len(untransformed)} items to ETL")

        self.transform_results(untransformed, hydrate=hydrate)

View File

@@ -0,0 +1,51 @@
import json
from loguru import logger
from typing import Generator
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
class BitchuteTransformer(Transformer):
    """Transformer for ScraperResult objects produced by ``BitchuteScraper``."""

    __version__ = "BitchuteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data.scraper`` starts with the word ``BitchuteScraper``."""
        scraper = data.scraper.split(' ')
        return scraper[0] == "BitchuteScraper"

    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
        """Yield a Video for the post's ``video_url``, if it was archived.

        Parameters
        ----------
        data : ScraperResult
            The raw scraped post; ``raw_data`` is parsed as JSON.
        transformed : TransformedResult
            The transformed version of ``data``; its ``id`` is stored on the
            generated media.

        Yields
        ------
        Media
            A single, not yet hydrated, Video object. Nothing is yielded when
            the video URL has no entry in ``data.archived_urls``.
        """
        raw = json.loads(data.raw_data)

        orig = raw['video_url']

        # Same handling as TwitterTransformer: a post scraped without media
        # archiving has no archived copy, which is not an error.
        if orig not in data.archived_urls:
            logger.info("Media discovered but not archived")
            return

        new = data.archived_urls[orig]

        yield Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Build a TransformedResult from a scraped Bitchute post.

        The post content is the text of the last ``<p>`` element of the HTML
        body. When the body contains no ``<p>`` at all, the text of the whole
        body is used instead of failing.

        Parameters
        ----------
        data : ScraperResult
            The raw scraped post; ``raw_data`` is parsed as JSON.

        Returns
        -------
        TransformedResult
            A TransformedResult representation of ``data``.
        """
        raw = json.loads(data.raw_data)

        soup = BeautifulSoup(raw['body'], features='html.parser')
        paragraphs = soup.find_all('p')
        content = paragraphs[-1].text if paragraphs else soup.get_text()

        transformed = TransformedResult(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            url=raw['url'],
            content=content,
            author_id=raw['author_id'],
            author_username=raw['author'])

        return transformed

View File

@@ -1,13 +1,51 @@
import json
from loguru import logger
from typing import Generator
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
class TwitterTransformer(Transformer):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
__version__ = "TwitterTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
    """Return True when ``data.scraper`` starts with the word ``TwitterScraper``."""
    scraper_name = data.scraper.split(' ')[0]
    return scraper_name == "TwitterScraper"
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yield a Media object for every archived attachment of a scraped tweet.

    Photos become Image objects; Gifs and Videos become Video objects. For
    videos the variant with the highest bitrate is used, for Gifs the first
    variant. Attachments without a usable URL, or whose URL has no entry in
    ``data.archived_urls``, are logged and skipped.

    Parameters
    ----------
    data : ScraperResult
        The raw scraped tweet; ``raw_data`` is parsed as JSON.
    transformed : TransformedResult
        The transformed version of ``data``; its ``id`` is stored on the
        generated media.

    Yields
    ------
    Media
        Image or Video objects that are not yet hydrated.
    """
    raw = json.loads(data.raw_data)

    # `media` may be null or missing; treat both as "no attachments".
    for media in raw.get('media') or []:
        media_type = media["_type"]
        orig = None

        if media_type == "snscrape.modules.twitter.Photo":
            orig = media["fullUrl"]
        elif media_type == "snscrape.modules.twitter.Gif":
            variants = media["variants"]
            if variants:
                orig = variants[0]["url"]
        elif media_type == "snscrape.modules.twitter.Video":
            # Only variants with a truthy bitrate are candidates. With none
            # left, max() would raise, so fall through to the warning.
            rated = [v for v in media["variants"] if v.get("bitrate")]
            if rated:
                orig = max(rated, key=lambda v: v["bitrate"])["url"]

        if orig is None:
            logger.warning(f"No media URL found for {media}")
            continue

        if orig not in data.archived_urls:
            logger.info("Media discovered but not archived")
            continue

        new = data.archived_urls[orig]

        if media_type == "snscrape.modules.twitter.Photo":
            m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
        else:
            m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)

        yield m
def transform(self, data: ScraperResult) -> TransformedResult:
raw = json.loads(data.raw_data)

View File

@@ -1,5 +1,6 @@
import requests
from loguru import logger
import time
def make_request(url, headers = None, max_retries = 5, break_codes = None):
@@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
while r.status_code not in break_codes and n_retries < 5:
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
n_retries += 1
# back off subsequent requests
time.sleep(n_retries)
r = requests.get(url, headers = headers)
if r.status_code not in break_codes:

105
test.py
View File

@@ -1,7 +1,7 @@
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.base import Channel, TransformedResult, ScraperResult
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
@@ -12,104 +12,26 @@ from cisticola.scraper import (
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
from cisticola.transformer import ETLController
from cisticola.transformer.twitter import TwitterTransformer
from sqlalchemy.orm import sessionmaker
logger.add("../test.log")
test_channels = [
Channel(
id=0,
name="Logan Williams (test)",
platform_id=891729132,
name="L Weber (test)",
platform_id=1424979017749442595,
category="test",
followers=None,
platform="Twitter",
url="https://twitter.com/obtusatum",
screenname="obtusatum",
url="https://twitter.com/LWeber33662141",
screenname="LWeber33662141",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="South West Ohio Proud Boys (test)",
platform_id=-1001276612436,
category="test",
followers=None,
platform="Telegram",
url="https://t.me/SouthwestOhioPB",
screenname="SouthwestOhioPB",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="LizardRepublic (test)",
platform_id='lizardrepublic',
category="test",
followers=None,
platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic",
screenname="lizardrepublic",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=4,
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
category="test",
followers=None,
platform="Bitchute",
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=5,
name="Mak1n' Bacon (test)",
platform_id='Mak1nBacon',
category="test",
followers=None,
platform="Odysee",
url="https://odysee.com/@Mak1nBacon",
screenname='Mak1nBacon',
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=6,
name="Capt. Marc Simon (test)",
platform_id='marc_capt',
category="test",
followers=None,
platform="Gab",
url="https://gab.com/marc_capt",
screenname='marc_capt',
country="CA",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=7,
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
category="test",
followers=None,
platform="Rumble",
url="https://rumble.com/c/c-916305",
screenname='we are uploading',
country="CA",
influencer=None,
public=True,
chat=False,
notes="")]
controller = ScraperController()
@@ -126,7 +48,14 @@ scrapers = [
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test3.db')
engine = create_engine('sqlite:///test.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels, archive_media = False)
controller.scrape_channels(test_channels, archive_media = True)
transformer = TwitterTransformer()
etl_controller = ETLController()
etl_controller.register_transformer(transformer)
etl_controller.connect_to_db(engine)
etl_controller.transform_all_untransformed()

View File

@@ -3,6 +3,7 @@ import pytest
from sqlalchemy import create_engine
from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = {
TWITTER_CHANNEL_KWARGS = {
'id': 5,
'name': 'Logan Williams (test)',
'platform_id': 891729132,
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
'category': 'test',
'followers': None,
'platform': 'Twitter',
'url': 'https://twitter.com/obtusatum',
'screenname': 'obtusatum',
'url': 'https://twitter.com/LWeber33662141',
'screenname': 'LWeber33662141',
'country': 'US',
'influencer': None,
'public': True,
@@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = {
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='function')
def controller(tmpdir_factory):
"""Initialize ScraperController and SQLite database file to be used for all
tests in the package.
"""
@pytest.fixture(scope='package')
def engine(tmpdir_factory):
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
tests in the package"""
file = tmpdir_factory.mktemp('test_data').join('test.db')
engine = create_engine(f'sqlite:///{file}')
return engine
@pytest.fixture(scope='package')
def controller(engine):
"""Initialize ScraperController to be used for all tests in the package."""
scraper_controller = ScraperController()
scraper_controller.connect_to_db(engine)
return scraper_controller
@pytest.fixture(scope='package')
def channel_kwargs():
def etl_controller(engine):
"""Initialize ETLController to be used for all tests in the package."""
etl_controller = ETLController()
etl_controller.connect_to_db(engine)
return etl_controller
@pytest.fixture(scope='package')
def channel_kwargs():
"""Define keyword arguments to use for defining test channels for each
platform to be scraped.
"""
return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS}
'bitchute': BITCHUTE_CHANNEL_KWARGS,
'gab': GAB_CHANNEL_KWARGS,
'gettr': GETTR_CHANNEL_KWARGS,
'odysee': ODYSEE_CHANNEL_KWARGS,
'rumble': RUMBLE_CHANNEL_KWARGS,
'telegram': TELEGRAM_CHANNEL_KWARGS,
'twitter': TWITTER_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

View File

@@ -9,6 +9,8 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
def test_scrape_bitchute_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -8,6 +8,8 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs):
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_gab_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())

View File

@@ -9,6 +9,8 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
def test_scrape_gettr_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -9,6 +9,8 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
def test_scrape_odysee_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -9,6 +9,8 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
def test_scrape_rumble_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -9,6 +9,8 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -9,6 +9,8 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -9,6 +9,8 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
def test_scrape_twitter_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)

View File

View File

@@ -0,0 +1,30 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
import json
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
from cisticola.transformer import TwitterTransformer
from cisticola.base import TransformedResult, Media
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Twitter test channel, run the ETL step on the results and
    check the posts and media that end up in the database.
    """
    controller.reset_db()

    twitter_channel = Channel(**channel_kwargs['twitter'])
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=[twitter_channel], archive_media=True)

    etl_controller.register_transformer(TwitterTransformer())
    etl_controller.transform_all_untransformed()

    # Read the results back through a session of our own on the shared engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(TransformedResult).all()
    media = session.query(Media).all()

    assert len(posts) == 3
    assert len(media) == 2

    last_post, last_media = posts[-1], media[-1]
    assert last_post.content == "This is a test. https://t.co/rzTFL9uFi6"
    assert json.loads(last_media.exif)['Composite:ImageSize'] == "826 728"