diff --git a/Pipfile b/Pipfile index 328faea..26295e9 100644 --- a/Pipfile +++ b/Pipfile @@ -17,6 +17,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" youtube-dl = "*" telethon = "*" +pytesseract = "*" +pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index f75c07f..bd5a480 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "495ba305ca55a0ac5754037ba133518b47324965dd3ab0b8db8b69206524d68e" + "sha256": "c29fb4651dfcf05e182e5cc94323c9a6aedf2a821cd57ea17b1b48f707283646" }, "pipfile-spec": 6, "requires": { @@ -42,11 +42,11 @@ }, "botocore": { "hashes": [ - "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", - "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" + "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea", + "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb" ], "markers": "python_version >= '3.6'", - "version": "==1.24.18" + "version": "==1.24.19" }, "bs4": { "hashes": [ @@ -344,6 +344,47 @@ "markers": "python_version >= '3.8'", "version": "==1.4.1" }, + "pillow": { + "hashes": [ + "sha256:011233e0c42a4a7836498e98c1acf5e744c96a67dd5032a6f666cc1fb97eab97", + "sha256:0f29d831e2151e0b7b39981756d201f7108d3d215896212ffe2e992d06bfe049", + "sha256:12875d118f21cf35604176872447cdb57b07126750a33748bac15e77f90f1f9c", + "sha256:14d4b1341ac07ae07eb2cc682f459bec932a380c3b122f5540432d8977e64eae", + "sha256:1c3c33ac69cf059bbb9d1a71eeaba76781b450bc307e2291f8a4764d779a6b28", + "sha256:1d19397351f73a88904ad1aee421e800fe4bbcd1aeee6435fb62d0a05ccd1030", + "sha256:253e8a302a96df6927310a9d44e6103055e8fb96a6822f8b7f514bb7ef77de56", + "sha256:2632d0f846b7c7600edf53c48f8f9f1e13e62f66a6dbc15191029d950bfed976", + "sha256:335ace1a22325395c4ea88e00ba3dc89ca029bd66bd5a3c382d53e44f0ccd77e", + 
"sha256:413ce0bbf9fc6278b2d63309dfeefe452835e1c78398efb431bab0672fe9274e", + "sha256:5100b45a4638e3c00e4d2320d3193bdabb2d75e79793af7c3eb139e4f569f16f", + "sha256:514ceac913076feefbeaf89771fd6febde78b0c4c1b23aaeab082c41c694e81b", + "sha256:528a2a692c65dd5cafc130de286030af251d2ee0483a5bf50c9348aefe834e8a", + "sha256:6295f6763749b89c994fcb6d8a7f7ce03c3992e695f89f00b741b4580b199b7e", + "sha256:6c8bc8238a7dfdaf7a75f5ec5a663f4173f8c367e5a39f87e720495e1eed75fa", + "sha256:718856856ba31f14f13ba885ff13874be7fefc53984d2832458f12c38205f7f7", + "sha256:7f7609a718b177bf171ac93cea9fd2ddc0e03e84d8fa4e887bdfc39671d46b00", + "sha256:80ca33961ced9c63358056bd08403ff866512038883e74f3a4bf88ad3eb66838", + "sha256:80fe64a6deb6fcfdf7b8386f2cf216d329be6f2781f7d90304351811fb591360", + "sha256:81c4b81611e3a3cb30e59b0cf05b888c675f97e3adb2c8672c3154047980726b", + "sha256:855c583f268edde09474b081e3ddcd5cf3b20c12f26e0d434e1386cc5d318e7a", + "sha256:9bfdb82cdfeccec50aad441afc332faf8606dfa5e8efd18a6692b5d6e79f00fd", + "sha256:a5d24e1d674dd9d72c66ad3ea9131322819ff86250b30dc5821cbafcfa0b96b4", + "sha256:a9f44cd7e162ac6191491d7249cceb02b8116b0f7e847ee33f739d7cb1ea1f70", + "sha256:b5b3f092fe345c03bca1e0b687dfbb39364b21ebb8ba90e3fa707374b7915204", + "sha256:b9618823bd237c0d2575283f2939655f54d51b4527ec3972907a927acbcc5bfc", + "sha256:cef9c85ccbe9bee00909758936ea841ef12035296c748aaceee535969e27d31b", + "sha256:d21237d0cd37acded35154e29aec853e945950321dd2ffd1a7d86fe686814669", + "sha256:d3c5c79ab7dfce6d88f1ba639b77e77a17ea33a01b07b99840d6ed08031cb2a7", + "sha256:d9d7942b624b04b895cb95af03a23407f17646815495ce4547f0e60e0b06f58e", + "sha256:db6d9fac65bd08cea7f3540b899977c6dee9edad959fa4eaf305940d9cbd861c", + "sha256:ede5af4a2702444a832a800b8eb7f0a7a1c0eed55b644642e049c98d589e5092", + "sha256:effb7749713d5317478bb3acb3f81d9d7c7f86726d41c1facca068a04cf5bb4c", + "sha256:f154d173286a5d1863637a7dcd8c3437bb557520b01bddb0be0258dcb72696b5", + "sha256:f25ed6e28ddf50de7e7ea99d7a976d6a9c415f03adcaac9c41ff6ff41b6d86ac" 
+ ], + "markers": "python_version >= '3.7'", + "version": "==9.0.1" + }, "pluggy": { "hashes": [ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", @@ -388,6 +429,10 @@ ], "version": "==0.4.8" }, + "pyexiftool": { + "git": "https://github.com/smarnach/pyexiftool.git", + "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" + }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -404,6 +449,14 @@ ], "version": "==1.7.1" }, + "pytesseract": { + "hashes": [ + "sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0", + "sha256:fecda37d1e4eaf744c657cd03a5daab4eb97c61506ac5550274322c8ae32eca2" + ], + "index": "pypi", + "version": "==0.3.9" + }, "pytest": { "hashes": [ "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", @@ -528,7 +581,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version >= '3.6' and python_version < '4.0'", "version": "==4.8" }, "s3transfer": { @@ -637,7 +690,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" }, "youtube-dl": { @@ -1002,7 +1055,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 
3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" }, "zipp": { diff --git a/cisticola/base.py b/cisticola/base.py index 2c9ad83..fc07846 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,9 +1,16 @@ from typing import List from dataclasses import dataclass from datetime import datetime - from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey +import pytesseract +import PIL +import io +import exiftool +import json +import os + +from .utils import make_request @dataclass class ScraperResult: @@ -96,7 +103,7 @@ class TransformedResult: platform: str #: User-specified integer that uniquely identifies a channel, e.g. ``15``. - channel: str + channel: int #: Datetime (relative to UTC) that the scraped post was created at. date: datetime @@ -107,15 +114,16 @@ class TransformedResult: #: URL of the original post url: str - #: Text of the original post - content: str - #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. author_id: str #: Username of author who made post. 
author_username: str + #: Text of the original post + content: str + + mapper_registry = registry() raw_data_table = Table('raw_data', mapper_registry.metadata, @@ -139,13 +147,78 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('scraper', String), Column('transformer', String), Column('platform', String), - Column('channel', String), + Column('channel', Integer), Column('date', DateTime), Column('date_archived', DateTime), Column('url', String), - Column('content', String), Column('author_id', String), - Column('author_username', String) + Column('author_username', String), + Column('content', String) ) -mapper_registry.map_imperatively(TransformedResult, analysis_table) \ No newline at end of file +mapper_registry.map_imperatively(TransformedResult, analysis_table) + +@dataclass +class Media: + raw_id: int + post: int + url: str + original_url: str + + exif: str = None + + def get_blob(self): + blob = make_request(self.url) + return blob.content + + def hydrate(self, blob = None): + if blob is None: + blob = self.get_blob() + + self.hydrate_exif(blob) + + def hydrate_exif(self, blob): + f = open('tmp', 'wb') + f.write(blob) + f.close() + + with exiftool.ExifTool() as et: + exif = et.get_metadata('tmp') + self.exif = json.dumps(exif) + + os.remove('tmp') + +@dataclass +class Image(Media): + ocr: str = None + + def hydrate(self, blob=None): + if blob is None: + blob = self.get_blob() + + super().hydrate(blob) + self.hydrate_ocr(blob) + + def hydrate_ocr(self, blob): + image = PIL.Image.open(io.BytesIO(blob)) + self.ocr = pytesseract.image_to_string(image) + +@dataclass +class Video(Media): + pass + +media_table = Table('media', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('type', String), + Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('post', Integer, ForeignKey('analysis.id')), + Column('url', String), + Column('original_url', String), + Column('exif', String), + 
Column('ocr', String) + ) + +mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') +mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') +mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 2d692e8..4f33931 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,4 +1,4 @@ -from .utils import make_request +from cisticola.utils import make_request from .base import Scraper, ScraperController from .bitchute import BitchuteScraper from .gab import GabScraper diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 524a729..f35a13e 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -10,7 +10,7 @@ import ffmpeg from sqlalchemy.orm import sessionmaker from cisticola.base import Channel, ScraperResult, mapper_registry -from cisticola.scraper import make_request +from cisticola.utils import make_request class Scraper: """Base class for defining platform-specific scrapers for scraping all posts @@ -204,7 +204,6 @@ class ScraperController: def __init__(self): self.scrapers = [] self.session = None - self.mapper_registry = None def register_scraper(self, scraper: Scraper): """Register a single Scraper instance to the controller. 
@@ -275,4 +274,12 @@ class ScraperController: mapper_registry.metadata.create_all(bind=engine) self.session = sessionmaker() - self.session.configure(bind=engine) \ No newline at end of file + self.engine = engine + self.session.configure(bind=self.engine) + + def reset_db(self): + + mapper_registry.metadata.drop_all(bind=self.engine) + self.connect_to_db(self.engine) + + diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index a5292aa..8a365f4 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -236,8 +236,8 @@ def append_details(video, detail): video["video_url"] = soup.select_one("video#player source").get("src") video["thumbnail_image"] = soup.select_one("video#player").get("poster") video["subject"] = soup.select_one("h1#video-title").text - video["author"] = soup.select_one("div.channel-banner p.name a").text - video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] + video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2] + video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() # we need *two more requests* to get the comment count and like/dislike counts diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index b528383..c019e27 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -67,9 +67,13 @@ class TwitterScraper(Scraper): parsed_url = urlparse(url) queries = parse_qs(parsed_url.query) + ext = '' + # TODO might require additional statements for other media formats if 'jpg' in queries.get('format', []): ext = '.jpg' + elif 'png' in queries.get('format', []): + ext = '.png' elif parsed_url.path.endswith('.mp4'): ext = '' diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index e3a4b49..7812b52 100644 --- 
a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,2 +1,3 @@ -from . import base -from .twitter import TwitterTransformer \ No newline at end of file +from .base import ETLController +from .twitter import TwitterTransformer +from .bitchute import BitchuteTransformer \ No newline at end of file diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 8005b4a..ce6bb59 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,7 +1,12 @@ -from cisticola.base import ScraperResult, TransformedResult +from typing import List, Generator +from loguru import logger +from sqlalchemy.orm import sessionmaker +from sqlalchemy.engine.base import Engine + +from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry class Transformer: - """Interface class for transformers""" + """Interface class for transformers.""" __version__ = "Transformer 0.0.0" @@ -9,8 +14,158 @@ class Transformer: pass def can_handle(data: ScraperResult) -> bool: + """Specifies whether or not a Transformer is capable of handling a particular + piece of scraped data. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to check for ability to handle. + + Returns + ------- + bool + True if it can be handled by this Transformer, false otherwise. + """ + + pass + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + """Yields Media objects from each piece of media present in a raw ScraperResult. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to process + transformed : TransformedResult + The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`) + + Yields + ------ + Media + A media object generated from the ScraperResult. 
One ScraperResult can have multiple pieces + of media contained within it, so this can generate an arbitrary number of Media objects + (or their subclasses.) These Media objects are not fully hydrated. + """ + pass def transform(data: ScraperResult) -> TransformedResult: + """Transform a ScraperResult into a TransformedResult object. This extracts additional attributes + that can be used directly for analysis. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to process. + + Returns + ------- + TransformedResult + A TransformedResult representation of the `data` object. + """ + pass + +class ETLController: + """An ETLController will transform raw scraped data (ScraperResult objects) into a more detailed format + for analysis by using Transformer objects that have been registered with the controller. + """ + + def __init__(self): + self.transformers = [] + + def register_transformer(self, transformer: Transformer): + """Adds a Transformer to the list of available Transformers. + + Parameters + ---------- + transformer : Transformer + The Transformer to register + """ + + self.transformers.append(transformer) + + def connect_to_db(self, engine: Engine): + """Connects the ETLController to a SQLAlchemy engine. + + Parameters + ---------- + engine : Engine + SQLAlchemy Engine object + """ + # create tables + mapper_registry.metadata.create_all(bind=engine) + + self.session = sessionmaker() + self.session.configure(bind=engine) + + @logger.catch + def transform_results(self, results: List[ScraperResult], hydrate: bool = True): + """Transforms raw ScraperResult objects into TransformedResult objects and + Media objects. Then, adds them to the database. + + Parameters + ---------- + results : List[ScraperResult] + A list of ScraperResult objects to be transformed + hydrate : bool + Whether or not to fully hydrate transformed media. Default True.
+ """ + if self.session is None: + logger.error("No DB session") + return + + for result in results: + for transformer in self.transformers: + handled = False + + if transformer.can_handle(result): + logger.info(f"{transformer} is handling result {result}") + handled = True + session = self.session() + + transformed = transformer.transform(result) + + session.add(transformed) + session.flush() + + media = transformer.transform_media(result, transformed) + + count = 0 + for obj in media: + if hydrate: + logger.info(f"Hydrating {obj}") + obj.hydrate() + + session.add(obj) + count += 1 + + session.commit() + logger.info(f"{transformer} generated {count} media objects") + break + + if handled == False: + logger.warning(f"No Transformer could handle {result}") + + @logger.catch + def transform_all_untransformed(self, hydrate: bool = True): + """Transform all ScraperResult objects in the database that do not have an + equivalent TransformedResult object stored. + + Parameters + ---------- + hydrate : bool + Whether or not to fully hydrate transformed media. Default True. 
+ """ + + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all() + logger.info(f"Found {len(untransformed)} items to ETL") + + self.transform_results(untransformed, hydrate=hydrate) \ No newline at end of file diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py new file mode 100644 index 0000000..de6f0a7 --- /dev/null +++ b/cisticola/transformer/bitchute.py @@ -0,0 +1,51 @@ +import json +from loguru import logger +from typing import Generator + +from bs4 import BeautifulSoup + +from cisticola.transformer.base import Transformer +from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media + +class BitchuteTransformer(Transformer): + """A Bitchute specific ScraperResult, with a method ETL/transforming""" + + __version__ = "BitchuteTransformer 0.0.1" + + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "BitchuteScraper": + return True + + return False + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + raw = json.loads(data.raw_data) + + orig = raw['video_url'] + new = data.archived_urls[orig] + + m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + + yield m + + def transform(self, data: ScraperResult) -> TransformedResult: + raw = json.loads(data.raw_data) + + soup = BeautifulSoup(raw['body'], features = 'html.parser') + content = soup.find_all('p')[-1].text + + transformed = TransformedResult( + raw_id=data.id, + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=data.channel, + date=data.date, + date_archived=data.date_archived, + url=raw['url'], + content=content, + author_id=raw['author_id'], + author_username=raw['author']) + + return transformed diff --git 
a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 866a9fb..6c0838c 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -1,13 +1,51 @@ import json +from loguru import logger +from typing import Generator from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult +from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" __version__ = "TwitterTransformer 0.0.1" + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "TwitterScraper": + return True + + return False + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + raw = json.loads(data.raw_data) + + if raw['media']: + for media in raw['media']: + orig = None + + if media["_type"] == "snscrape.modules.twitter.Photo": + orig = media["fullUrl"] + elif media["_type"] == "snscrape.modules.twitter.Gif": + orig = media["variants"][0]["url"] + elif media["_type"] == "snscrape.modules.twitter.Video": + variant = max([v for v in media["variants"] if v["bitrate"]], key=lambda v: v["bitrate"]) + orig = variant["url"] + + if orig is None: + logger.warning(f"No media URL found for {media}") + elif orig not in data.archived_urls: + logger.info("Media discovered but not archived") + else: + new = data.archived_urls[orig] + + if media["_type"] == "snscrape.modules.twitter.Photo": + m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + else: + m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + + yield m + def transform(self, data: ScraperResult) -> TransformedResult: raw = json.loads(data.raw_data) diff --git a/cisticola/scraper/utils.py b/cisticola/utils.py similarity index 96% rename from cisticola/scraper/utils.py rename to 
cisticola/utils.py index 079bd1f..385884f 100644 --- a/cisticola/scraper/utils.py +++ b/cisticola/utils.py @@ -1,5 +1,6 @@ import requests from loguru import logger +import time def make_request(url, headers = None, max_retries = 5, break_codes = None): @@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None): while r.status_code not in break_codes and n_retries < 5: logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}") n_retries += 1 + + # back off subsequent requests + time.sleep(n_retries) r = requests.get(url, headers = headers) if r.status_code not in break_codes: diff --git a/test.py b/test.py index 8a2d624..ade611c 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,7 @@ from sqlalchemy import create_engine from loguru import logger -from cisticola.base import Channel +from cisticola.base import Channel, TransformedResult, ScraperResult from cisticola.scraper import ( ScraperController, BitchuteScraper, @@ -12,104 +12,26 @@ from cisticola.scraper import ( TelegramSnscrapeScraper, TelegramTelethonScraper, TwitterScraper) +from cisticola.transformer import ETLController +from cisticola.transformer.twitter import TwitterTransformer +from sqlalchemy.orm import sessionmaker logger.add("../test.log") test_channels = [ Channel( id=0, - name="Logan Williams (test)", - platform_id=891729132, + name="L Weber (test)", + platform_id=1424979017749442595, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", - screenname="obtusatum", + url="https://twitter.com/LWeber33662141", + screenname="LWeber33662141", country="US", influencer=None, public=True, chat=False, - notes=""), - Channel( - id=1, - name="South West Ohio Proud Boys (test)", - platform_id=-1001276612436, - category="test", - followers=None, - platform="Telegram", - url="https://t.me/SouthwestOhioPB", - screenname="SouthwestOhioPB", - country="US", - influencer=None, - public=True, 
- chat=False, - notes=""), - Channel( - id=2, - name="LizardRepublic (test)", - platform_id='lizardrepublic', - category="test", - followers=None, - platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", - screenname="lizardrepublic", - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=4, - name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', - category="test", - followers=None, - platform="Bitchute", - url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=5, - name="Mak1n' Bacon (test)", - platform_id='Mak1nBacon', - category="test", - followers=None, - platform="Odysee", - url="https://odysee.com/@Mak1nBacon", - screenname='Mak1nBacon', - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=6, - name="Capt. Marc Simon (test)", - platform_id='marc_capt', - category="test", - followers=None, - platform="Gab", - url="https://gab.com/marc_capt", - screenname='marc_capt', - country="CA", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=7, - name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', - category="test", - followers=None, - platform="Rumble", - url="https://rumble.com/c/c-916305", - screenname='we are uploading', - country="CA", - influencer=None, - public=True, - chat=False, notes="")] controller = ScraperController() @@ -126,7 +48,14 @@ scrapers = [ controller.register_scrapers(scrapers) -engine = create_engine('sqlite:///test3.db') +engine = create_engine('sqlite:///test.db') controller.connect_to_db(engine) -controller.scrape_channels(test_channels, archive_media = 
False) \ No newline at end of file +controller.scrape_channels(test_channels, archive_media = True) + +transformer = TwitterTransformer() + +etl_controller = ETLController() +etl_controller.register_transformer(transformer) +etl_controller.connect_to_db(engine) +etl_controller.transform_all_untransformed() diff --git a/tests/conftest.py b/tests/conftest.py index 0608903..42548e9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import create_engine from cisticola.scraper import ScraperController +from cisticola.transformer import ETLController #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = { TWITTER_CHANNEL_KWARGS = { 'id': 5, - 'name': 'Logan Williams (test)', - 'platform_id': 891729132, + 'name': 'L Weber (test)', + 'platform_id': 1424979017749442595, 'category': 'test', 'followers': None, 'platform': 'Twitter', - 'url': 'https://twitter.com/obtusatum', - 'screenname': 'obtusatum', + 'url': 'https://twitter.com/LWeber33662141', + 'screenname': 'LWeber33662141', 'country': 'US', 'influencer': None, 'public': True, @@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = { #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.fixture(scope='function') -def controller(tmpdir_factory): - """Initialize ScraperController and SQLite database file to be used for all - tests in the package. 
- """ - +@pytest.fixture(scope='package') +def engine(tmpdir_factory): + """Initialize a SQLite database and SQLAlchemy engine to be used for all + tests in the package""" + file = tmpdir_factory.mktemp('test_data').join('test.db') engine = create_engine(f'sqlite:///{file}') + return engine + + +@pytest.fixture(scope='package') +def controller(engine): + """Initialize ScraperController to be used for all tests in the package.""" + scraper_controller = ScraperController() scraper_controller.connect_to_db(engine) return scraper_controller @pytest.fixture(scope='package') -def channel_kwargs(): +def etl_controller(engine): + """Initialize ETLController to be used for all tests in the package.""" + etl_controller = ETLController() + etl_controller.connect_to_db(engine) + + return etl_controller + +@pytest.fixture(scope='package') +def channel_kwargs(): """Define keyword arguments to use for defining test channels for each platform to be scraped. """ return { - 'bitchute' : BITCHUTE_CHANNEL_KWARGS, - 'gab' : GAB_CHANNEL_KWARGS, - 'gettr' : GETTR_CHANNEL_KWARGS, - 'odysee' : ODYSEE_CHANNEL_KWARGS, - 'rumble' : RUMBLE_CHANNEL_KWARGS, - 'telegram' : TELEGRAM_CHANNEL_KWARGS, - 'twitter' : TWITTER_CHANNEL_KWARGS} + 'bitchute': BITCHUTE_CHANNEL_KWARGS, + 'gab': GAB_CHANNEL_KWARGS, + 'gettr': GETTR_CHANNEL_KWARGS, + 'odysee': ODYSEE_CHANNEL_KWARGS, + 'rumble': RUMBLE_CHANNEL_KWARGS, + 'telegram': TELEGRAM_CHANNEL_KWARGS, + 'twitter': TWITTER_CHANNEL_KWARGS} -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/scraper/__init__.py b/tests/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index bc64c4b..c32e840 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -9,6 +9,8 @@ def 
test_scrape_bitchute_channel_no_media(controller, channel_kwargs): def test_scrape_bitchute_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index 29fa34a..c864c37 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -8,6 +8,8 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs): controller.scrape_channels(channels = channels, archive_media = False) def test_scrape_gab_channel(controller, channel_kwargs): + + controller.reset_db() channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 186b74c..7dd2f24 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -9,6 +9,8 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs): def test_scrape_gettr_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 8b9f89a..f97700e 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -9,6 +9,8 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs): def test_scrape_odysee_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index daf59f6..5f640e5 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -9,6 +9,8 @@ def 
test_scrape_rumble_channel_no_media(controller, channel_kwargs): def test_scrape_rumble_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index af25ed7..3848780 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -9,6 +9,8 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index 1cfc529..c015631 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -9,6 +9,8 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): def test_scrape_telegram_telethon_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index ef375b7..bd79a6a 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -9,6 +9,8 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs): def test_scrape_twitter_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git 
a/tests/transformer/__init__.py b/tests/transformer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py new file mode 100644 index 0000000..4ddeb68 --- /dev/null +++ b/tests/transformer/twitter.py @@ -0,0 +1,30 @@ +from sqlalchemy.orm import sessionmaker, with_polymorphic +import json + +from cisticola.base import Channel +from cisticola.scraper import TwitterScraper +from cisticola.transformer import TwitterTransformer +from cisticola.base import TransformedResult, Media + +def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['twitter'])] + controller.register_scraper(scraper = TwitterScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(TwitterTransformer()) + etl_controller.transform_all_untransformed() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(TransformedResult).all() + media = session.query(Media).all() + + assert len(posts) == 3 + assert len(media) == 2 + + assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6" + assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file