mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
merged main
This commit is contained in:
2
Pipfile
2
Pipfile
@@ -17,6 +17,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
youtube-dl = "*"
|
||||
telethon = "*"
|
||||
pytesseract = "*"
|
||||
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
67
Pipfile.lock
generated
67
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "495ba305ca55a0ac5754037ba133518b47324965dd3ab0b8db8b69206524d68e"
|
||||
"sha256": "c29fb4651dfcf05e182e5cc94323c9a6aedf2a821cd57ea17b1b48f707283646"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -42,11 +42,11 @@
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
|
||||
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
|
||||
"sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
|
||||
"sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.18"
|
||||
"version": "==1.24.19"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
@@ -344,6 +344,47 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.4.1"
|
||||
},
|
||||
"pillow": {
|
||||
"hashes": [
|
||||
"sha256:011233e0c42a4a7836498e98c1acf5e744c96a67dd5032a6f666cc1fb97eab97",
|
||||
"sha256:0f29d831e2151e0b7b39981756d201f7108d3d215896212ffe2e992d06bfe049",
|
||||
"sha256:12875d118f21cf35604176872447cdb57b07126750a33748bac15e77f90f1f9c",
|
||||
"sha256:14d4b1341ac07ae07eb2cc682f459bec932a380c3b122f5540432d8977e64eae",
|
||||
"sha256:1c3c33ac69cf059bbb9d1a71eeaba76781b450bc307e2291f8a4764d779a6b28",
|
||||
"sha256:1d19397351f73a88904ad1aee421e800fe4bbcd1aeee6435fb62d0a05ccd1030",
|
||||
"sha256:253e8a302a96df6927310a9d44e6103055e8fb96a6822f8b7f514bb7ef77de56",
|
||||
"sha256:2632d0f846b7c7600edf53c48f8f9f1e13e62f66a6dbc15191029d950bfed976",
|
||||
"sha256:335ace1a22325395c4ea88e00ba3dc89ca029bd66bd5a3c382d53e44f0ccd77e",
|
||||
"sha256:413ce0bbf9fc6278b2d63309dfeefe452835e1c78398efb431bab0672fe9274e",
|
||||
"sha256:5100b45a4638e3c00e4d2320d3193bdabb2d75e79793af7c3eb139e4f569f16f",
|
||||
"sha256:514ceac913076feefbeaf89771fd6febde78b0c4c1b23aaeab082c41c694e81b",
|
||||
"sha256:528a2a692c65dd5cafc130de286030af251d2ee0483a5bf50c9348aefe834e8a",
|
||||
"sha256:6295f6763749b89c994fcb6d8a7f7ce03c3992e695f89f00b741b4580b199b7e",
|
||||
"sha256:6c8bc8238a7dfdaf7a75f5ec5a663f4173f8c367e5a39f87e720495e1eed75fa",
|
||||
"sha256:718856856ba31f14f13ba885ff13874be7fefc53984d2832458f12c38205f7f7",
|
||||
"sha256:7f7609a718b177bf171ac93cea9fd2ddc0e03e84d8fa4e887bdfc39671d46b00",
|
||||
"sha256:80ca33961ced9c63358056bd08403ff866512038883e74f3a4bf88ad3eb66838",
|
||||
"sha256:80fe64a6deb6fcfdf7b8386f2cf216d329be6f2781f7d90304351811fb591360",
|
||||
"sha256:81c4b81611e3a3cb30e59b0cf05b888c675f97e3adb2c8672c3154047980726b",
|
||||
"sha256:855c583f268edde09474b081e3ddcd5cf3b20c12f26e0d434e1386cc5d318e7a",
|
||||
"sha256:9bfdb82cdfeccec50aad441afc332faf8606dfa5e8efd18a6692b5d6e79f00fd",
|
||||
"sha256:a5d24e1d674dd9d72c66ad3ea9131322819ff86250b30dc5821cbafcfa0b96b4",
|
||||
"sha256:a9f44cd7e162ac6191491d7249cceb02b8116b0f7e847ee33f739d7cb1ea1f70",
|
||||
"sha256:b5b3f092fe345c03bca1e0b687dfbb39364b21ebb8ba90e3fa707374b7915204",
|
||||
"sha256:b9618823bd237c0d2575283f2939655f54d51b4527ec3972907a927acbcc5bfc",
|
||||
"sha256:cef9c85ccbe9bee00909758936ea841ef12035296c748aaceee535969e27d31b",
|
||||
"sha256:d21237d0cd37acded35154e29aec853e945950321dd2ffd1a7d86fe686814669",
|
||||
"sha256:d3c5c79ab7dfce6d88f1ba639b77e77a17ea33a01b07b99840d6ed08031cb2a7",
|
||||
"sha256:d9d7942b624b04b895cb95af03a23407f17646815495ce4547f0e60e0b06f58e",
|
||||
"sha256:db6d9fac65bd08cea7f3540b899977c6dee9edad959fa4eaf305940d9cbd861c",
|
||||
"sha256:ede5af4a2702444a832a800b8eb7f0a7a1c0eed55b644642e049c98d589e5092",
|
||||
"sha256:effb7749713d5317478bb3acb3f81d9d7c7f86726d41c1facca068a04cf5bb4c",
|
||||
"sha256:f154d173286a5d1863637a7dcd8c3437bb557520b01bddb0be0258dcb72696b5",
|
||||
"sha256:f25ed6e28ddf50de7e7ea99d7a976d6a9c415f03adcaac9c41ff6ff41b6d86ac"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==9.0.1"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
@@ -388,6 +429,10 @@
|
||||
],
|
||||
"version": "==0.4.8"
|
||||
},
|
||||
"pyexiftool": {
|
||||
"git": "https://github.com/smarnach/pyexiftool.git",
|
||||
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
||||
@@ -404,6 +449,14 @@
|
||||
],
|
||||
"version": "==1.7.1"
|
||||
},
|
||||
"pytesseract": {
|
||||
"hashes": [
|
||||
"sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0",
|
||||
"sha256:fecda37d1e4eaf744c657cd03a5daab4eb97c61506ac5550274322c8ae32eca2"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.3.9"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
|
||||
@@ -528,7 +581,7 @@
|
||||
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
|
||||
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '4'",
|
||||
"markers": "python_version >= '3.6' and python_version < '4.0'",
|
||||
"version": "==4.8"
|
||||
},
|
||||
"s3transfer": {
|
||||
@@ -637,7 +690,7 @@
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
|
||||
"version": "==1.26.8"
|
||||
},
|
||||
"youtube-dl": {
|
||||
@@ -1002,7 +1055,7 @@
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
|
||||
"version": "==1.26.8"
|
||||
},
|
||||
"zipp": {
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
import pytesseract
|
||||
import PIL
|
||||
import io
|
||||
import exiftool
|
||||
import json
|
||||
import os
|
||||
|
||||
from .utils import make_request
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
@@ -96,7 +103,7 @@ class TransformedResult:
|
||||
platform: str
|
||||
|
||||
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||
channel: str
|
||||
channel: int
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
@@ -107,15 +114,16 @@ class TransformedResult:
|
||||
#: URL of the original post
|
||||
url: str
|
||||
|
||||
#: Text of the original post
|
||||
content: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
author_id: str
|
||||
|
||||
#: Username of author who made post.
|
||||
author_username: str
|
||||
|
||||
#: Text of the original post
|
||||
content: str
|
||||
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
@@ -139,13 +147,78 @@ analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', String),
|
||||
Column('channel', Integer),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('url', String),
|
||||
Column('content', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String)
|
||||
Column('author_username', String),
|
||||
Column('content', String)
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
|
||||
@dataclass
class Media:
    """A single piece of media (image, video, ...) attached to a scraped post.

    Instances start out holding only identifiers and URLs; calling
    :meth:`hydrate` downloads the file and fills in the derived fields.
    """

    #: Primary key of the raw scraped row this media came from (``raw_data.id``).
    raw_id: int

    #: Primary key of the transformed post this media belongs to (``analysis.id``).
    post: int

    #: URL the media blob is downloaded from by :meth:`get_blob`.
    url: str

    #: URL of the media on the original platform.
    original_url: str

    #: JSON-encoded metadata extracted by exiftool, or ``None`` until hydrated.
    exif: str = None

    def get_blob(self):
        """Download the media from ``self.url`` and return the response body."""
        response = make_request(self.url)
        return response.content

    def hydrate(self, blob = None):
        """Populate the derived fields of this object.

        Parameters
        ----------
        blob : bytes, optional
            Raw content of the media file. Downloaded with :meth:`get_blob`
            when omitted.
        """
        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Run exiftool over ``blob`` and store the metadata as JSON in ``self.exif``.

        Parameters
        ----------
        blob : bytes
            Raw content of the media file.
        """
        # Local import: only this method needs it.
        import tempfile

        # exiftool is handed a path, so the blob is spilled to disk. A unique
        # temporary file (instead of a fixed 'tmp' in the working directory)
        # avoids clobbering unrelated files and collisions between concurrent
        # runs. Note the path-related tags in the metadata (e.g. the source
        # file name) will reflect this temporary path.
        fd, path = tempfile.mkstemp()
        try:
            # Close the handle before exiftool opens the file.
            with os.fdopen(fd, 'wb') as f:
                f.write(blob)

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(path)

            self.exif = json.dumps(exif)
        finally:
            # Always clean up, even when exiftool or serialisation fails.
            os.remove(path)
|
||||
|
||||
@dataclass
class Image(Media):
    """Media subclass for images; adds OCR text on top of the EXIF metadata."""

    #: Text recognised in the image by pytesseract, or ``None`` until hydrated.
    ocr: str = None

    def hydrate(self, blob=None):
        """Populate both the EXIF and the OCR fields.

        Parameters
        ----------
        blob : bytes, optional
            Raw content of the image file. Downloaded with ``get_blob`` when
            omitted, so the same bytes are shared by both hydration steps.
        """
        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Run OCR over ``blob`` and store the recognised text in ``self.ocr``.

        Parameters
        ----------
        blob : bytes
            Raw content of the image file.
        """
        # Import the submodule explicitly: a bare ``import PIL`` does not load
        # ``PIL.Image`` by itself. The alias avoids clashing with this class.
        from PIL import Image as PILImage

        # The context manager releases the image's resources once OCR is done.
        with PILImage.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)
|
||||
|
||||
@dataclass
class Video(Media):
    """Media subclass for videos; declares no fields or methods of its own."""

    pass
|
||||
|
||||
# One table is shared by Media and its subclasses; the 'type' column is the
# discriminator that records which class a row was stored as.
media_table = Table(
    'media',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('type', String),
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    Column('post', Integer, ForeignKey('analysis.id')),
    Column('url', String),
    Column('original_url', String),
    Column('exif', String),
    Column('ocr', String),
)

# Base mapping first, then each subclass inheriting from it on the same table.
mapper_registry.map_imperatively(
    Media,
    media_table,
    polymorphic_on='type',
    polymorphic_identity='media',
)
mapper_registry.map_imperatively(
    Image,
    media_table,
    inherits=Media,
    polymorphic_on='type',
    polymorphic_identity='image',
)
mapper_registry.map_imperatively(
    Video,
    media_table,
    inherits=Media,
    polymorphic_on='type',
    polymorphic_identity='video',
)
|
||||
@@ -1,4 +1,4 @@
|
||||
from .utils import make_request
|
||||
from cisticola.utils import make_request
|
||||
from .base import Scraper, ScraperController
|
||||
from .bitchute import BitchuteScraper
|
||||
from .gab import GabScraper
|
||||
|
||||
@@ -10,7 +10,7 @@ import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import make_request
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||
@@ -204,7 +204,6 @@ class ScraperController:
|
||||
def __init__(self):
|
||||
self.scrapers = []
|
||||
self.session = None
|
||||
self.mapper_registry = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
"""Register a single Scraper instance to the controller.
|
||||
@@ -275,4 +274,12 @@ class ScraperController:
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
self.session = sessionmaker()
|
||||
self.session.configure(bind=engine)
|
||||
self.engine = engine
|
||||
self.session.configure(bind=self.engine)
|
||||
|
||||
def reset_db(self):
    """Drop every table known to ``mapper_registry`` and set the database up again.

    Works on the engine stored in ``self.engine`` and hands that same engine
    back to ``connect_to_db`` once the tables have been dropped.
    """
    engine = self.engine

    mapper_registry.metadata.drop_all(bind=engine)
    self.connect_to_db(engine)
|
||||
|
||||
|
||||
|
||||
@@ -236,8 +236,8 @@ def append_details(video, detail):
|
||||
video["video_url"] = soup.select_one("video#player source").get("src")
|
||||
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
|
||||
video["subject"] = soup.select_one("h1#video-title").text
|
||||
video["author"] = soup.select_one("div.channel-banner p.name a").text
|
||||
video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
||||
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
|
||||
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
||||
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
|
||||
|
||||
# we need *two more requests* to get the comment count and like/dislike counts
|
||||
|
||||
@@ -67,9 +67,13 @@ class TwitterScraper(Scraper):
|
||||
parsed_url = urlparse(url)
|
||||
queries = parse_qs(parsed_url.query)
|
||||
|
||||
ext = ''
|
||||
|
||||
# TODO might require additional statements for other media formats
|
||||
if 'jpg' in queries.get('format', []):
|
||||
ext = '.jpg'
|
||||
elif 'png' in queries.get('format', []):
|
||||
ext = '.png'
|
||||
elif parsed_url.path.endswith('.mp4'):
|
||||
ext = ''
|
||||
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
from . import base
|
||||
from .twitter import TwitterTransformer
|
||||
from .base import ETLController
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
@@ -1,7 +1,12 @@
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
from typing import List, Generator
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.engine.base import Engine
|
||||
|
||||
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
|
||||
|
||||
class Transformer:
|
||||
"""Interface class for transformers"""
|
||||
"""Interface class for transformers."""
|
||||
|
||||
__version__ = "Transformer 0.0.0"
|
||||
|
||||
@@ -9,8 +14,158 @@ class Transformer:
|
||||
pass
|
||||
|
||||
def can_handle(self, data: ScraperResult) -> bool:
    """Specifies whether or not a Transformer is capable of handling a particular
    piece of scraped data.

    Parameters
    ----------
    data : ScraperResult
        The ScraperResult object to check for ability to handle.

    Returns
    -------
    bool
        True if it can be handled by this Transformer, false otherwise.
    """

    # The interface class handles nothing itself; subclasses override this.
    # ``self`` is required because the method is called on instances.
    return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yields Media objects from each piece of media present in a raw ScraperResult.

    Parameters
    ----------
    data : ScraperResult
        The ScraperResult object to process
    transformed : TransformedResult
        The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`)

    Yields
    ------
    Media
        A media object generated from the ScraperResult. One ScraperResult can have multiple pieces
        of media contained within it, so this can generate an arbitrary number of Media objects
        (or their subclasses.) These Media objects are not fully hydrated.
    """

    # The interface class has no media to offer. Yielding from an empty
    # iterable makes this a real (empty) generator, so callers that iterate
    # the result keep working when a subclass does not override this method.
    yield from ()
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
    """Transform a ScraperResult into a TransformedResult object. This extracts additional attributes
    that can be used directly for analysis.

    The interface class does nothing here and returns ``None``; subclasses
    provide the actual implementation.

    Parameters
    ----------
    data : ScraperResult
        The ScraperResult object to process.

    Returns
    -------
    TransformedResult
        A TransformedResult representation of the `data` object.
    """

    # ``self`` is required because the method is called on instances.
    pass
|
||||
|
||||
|
||||
class ETLController:
    """An ETLController will transform raw scraped data (ScraperResult objects) into a more detailed format
    for analysis by using Transformer objects that have been registered with the controller.
    """

    def __init__(self):
        self.transformers = []

        # Session factory; stays None until connect_to_db() has been called,
        # which is what the "No DB session" checks below rely on.
        self.session = None

    def register_transformer(self, transformer: Transformer):
        """Adds a Transformer to the list of available Transformers.

        Parameters
        ----------
        transformer : Transformer
            The Transformer to register
        """

        self.transformers.append(transformer)

    def connect_to_db(self, engine: Engine):
        """Connects the ETLController to a SQLAlchemy engine.

        Parameters
        ----------
        engine : Engine
            SQLAlchemy Engine object
        """

        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)

    @logger.catch
    def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
        """Transforms raw ScraperResults objects into TransformedResult objects and
        Media objects. Then, adds them to the database.

        Each result is handled by the first registered Transformer whose
        ``can_handle`` returns a truthy value; a warning is logged when none does.

        Parameters
        ----------
        results : List[ScraperResult]
            A list of ScraperResult objects to be transformed
        hydrate : bool
            Whether or not to fully hydrate transformed media. Default True.
        """

        if self.session is None:
            logger.error("No DB session")
            return

        for result in results:
            for transformer in self.transformers:
                if transformer.can_handle(result):
                    self._transform_one(transformer, result, hydrate)
                    break
            else:
                # Reached when the loop was not broken out of, including when
                # no transformers are registered at all.
                logger.warning(f"No Transformer could handle {result}")

    def _transform_one(self, transformer: Transformer, result: ScraperResult, hydrate: bool):
        """Transform one result with ``transformer`` and persist the post and its media."""

        logger.info(f"{transformer} is handling result {result}")

        session = self.session()
        try:
            transformed = transformer.transform(result)

            session.add(transformed)
            # Flush before building media so database-generated values on
            # ``transformed`` are available to transform_media.
            session.flush()

            count = 0
            for obj in transformer.transform_media(result, transformed):
                if hydrate:
                    logger.info(f"Hydrating {obj}")
                    obj.hydrate()

                session.add(obj)
                count += 1

            session.commit()
        except Exception:
            # Do not leave a half-written post behind; let the caller
            # (wrapped in logger.catch) report the error.
            session.rollback()
            raise
        finally:
            session.close()

        logger.info(f"{transformer} generated {count} media objects")

    @logger.catch
    def transform_all_untransformed(self, hydrate: bool = True):
        """Transform all ScraperResult objects in the database that do not have an
        equivalent TransformedResult object stored.

        Parameters
        ----------
        hydrate : bool
            Whether or not to fully hydrate transformed media. Default True.
        """

        if self.session is None:
            logger.error("No DB session")
            return

        session = self.session()

        # Outer join keeps raw rows without a transformed counterpart; those
        # are the ones whose joined raw_id is NULL.
        untransformed = (
            session.query(ScraperResult)
            .join(TransformedResult, isouter=True)
            .where(TransformedResult.raw_id.is_(None))
            .all()
        )
        logger.info(f"Found {len(untransformed)} items to ETL")

        self.transform_results(untransformed, hydrate=hydrate)
|
||||
51
cisticola/transformer/bitchute.py
Normal file
51
cisticola/transformer/bitchute.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
|
||||
class BitchuteTransformer(Transformer):
    """Transformer for raw results produced by the Bitchute scraper."""

    __version__ = "BitchuteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by ``BitchuteScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so anything following the scraper name is ignored.
        """
        return data.scraper.split(' ')[0] == "BitchuteScraper"

    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
        """Yield a Video for the post's video, if it has been archived.

        Parameters
        ----------
        data : ScraperResult
            The raw result; its ``raw_data`` JSON provides ``video_url``.
        transformed : TransformedResult
            The transformed version of ``data``; its ``id`` is stored on the media.

        Yields
        ------
        Media
            At most one Video. Nothing is yielded when the video URL is
            missing or has no entry in ``data.archived_urls``.
        """
        raw = json.loads(data.raw_data)
        orig = raw.get('video_url')

        # Same guards as the Twitter transformer: skip instead of raising.
        if orig is None:
            logger.warning(f"No media URL found for {raw.get('url')}")
            return
        if orig not in data.archived_urls:
            logger.info("Media discovered but not archived")
            return

        new = data.archived_urls[orig]
        yield Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Build a TransformedResult from a raw Bitchute result.

        The post content is the text of the last ``<p>`` element of the HTML
        in ``raw['body']``; when the body has no ``<p>`` at all, the full text
        of the body is used instead.
        """
        raw = json.loads(data.raw_data)

        soup = BeautifulSoup(raw['body'], features = 'html.parser')
        paragraphs = soup.find_all('p')
        content = paragraphs[-1].text if paragraphs else soup.get_text()

        transformed = TransformedResult(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            url=raw['url'],
            content=content,
            author_id=raw['author_id'],
            author_username=raw['author'])

        return transformed
|
||||
@@ -1,13 +1,51 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
|
||||
__version__ = "TwitterTransformer 0.0.1"
|
||||
|
||||
def can_handle(self, data: ScraperResult) -> bool:
    """Return True when ``data`` was produced by ``TwitterScraper``.

    Only the first space-separated token of ``data.scraper`` is compared.
    """
    scraper_name = data.scraper.split(' ')[0]
    return scraper_name == "TwitterScraper"
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yield an Image or Video for every archived media item of a tweet.

    Parameters
    ----------
    data : ScraperResult
        The raw result; its ``raw_data`` JSON provides the ``media`` list.
    transformed : TransformedResult
        The transformed version of ``data``; its ``id`` is stored on the media.

    Yields
    ------
    Media
        An Image for photos, a Video for everything else. Items without a
        usable URL, or whose URL has no entry in ``data.archived_urls``,
        are logged and skipped.
    """
    photo_type = "snscrape.modules.twitter.Photo"
    gif_type = "snscrape.modules.twitter.Gif"
    video_type = "snscrape.modules.twitter.Video"

    raw = json.loads(data.raw_data)

    # 'media' may be absent or null for tweets without attachments.
    for media in raw.get('media') or []:
        media_type = media["_type"]
        variants = media.get("variants") or []
        orig = None

        if media_type == photo_type:
            orig = media["fullUrl"]
        elif media_type == gif_type:
            if variants:
                orig = variants[0]["url"]
        elif media_type == video_type:
            # Pick the highest-bitrate variant; variants without a bitrate
            # are ignored. ``default`` avoids a ValueError when none qualify.
            with_bitrate = [v for v in variants if v.get("bitrate")]
            best = max(with_bitrate, key=lambda v: v["bitrate"], default=None)
            if best is not None:
                orig = best["url"]

        if orig is None:
            logger.warning(f"No media URL found for {media}")
            continue
        if orig not in data.archived_urls:
            logger.info("Media discovered but not archived")
            continue

        new = data.archived_urls[orig]
        media_cls = Image if media_type == photo_type else Video

        yield media_cls(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
import time
|
||||
|
||||
def make_request(url, headers = None, max_retries = 5, break_codes = None):
|
||||
|
||||
@@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
|
||||
while r.status_code not in break_codes and n_retries < 5:
|
||||
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
|
||||
n_retries += 1
|
||||
|
||||
# back off subsequent requests
|
||||
time.sleep(n_retries)
|
||||
r = requests.get(url, headers = headers)
|
||||
|
||||
if r.status_code not in break_codes:
|
||||
105
test.py
105
test.py
@@ -1,7 +1,7 @@
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.base import Channel, TransformedResult, ScraperResult
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
@@ -12,104 +12,26 @@ from cisticola.scraper import (
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
from cisticola.transformer import ETLController
|
||||
from cisticola.transformer.twitter import TwitterTransformer
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
logger.add("../test.log")
|
||||
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="Logan Williams (test)",
|
||||
platform_id=891729132,
|
||||
name="L Weber (test)",
|
||||
platform_id=1424979017749442595,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Twitter",
|
||||
url="https://twitter.com/obtusatum",
|
||||
screenname="obtusatum",
|
||||
url="https://twitter.com/LWeber33662141",
|
||||
screenname="LWeber33662141",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="South West Ohio Proud Boys (test)",
|
||||
platform_id=-1001276612436,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Telegram",
|
||||
url="https://t.me/SouthwestOhioPB",
|
||||
screenname="SouthwestOhioPB",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="LizardRepublic (test)",
|
||||
platform_id='lizardrepublic',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic",
|
||||
screenname="lizardrepublic",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=4,
|
||||
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=5,
|
||||
name="Mak1n' Bacon (test)",
|
||||
platform_id='Mak1nBacon',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Odysee",
|
||||
url="https://odysee.com/@Mak1nBacon",
|
||||
screenname='Mak1nBacon',
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=6,
|
||||
name="Capt. Marc Simon (test)",
|
||||
platform_id='marc_capt',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gab",
|
||||
url="https://gab.com/marc_capt",
|
||||
screenname='marc_capt',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=7,
|
||||
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Rumble",
|
||||
url="https://rumble.com/c/c-916305",
|
||||
screenname='we are uploading',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes="")]
|
||||
|
||||
controller = ScraperController()
|
||||
@@ -126,7 +48,14 @@ scrapers = [
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels, archive_media = False)
|
||||
controller.scrape_channels(test_channels, archive_media = True)
|
||||
|
||||
transformer = TwitterTransformer()
|
||||
|
||||
etl_controller = ETLController()
|
||||
etl_controller.register_transformer(transformer)
|
||||
etl_controller.connect_to_db(engine)
|
||||
etl_controller.transform_all_untransformed()
|
||||
|
||||
@@ -3,6 +3,7 @@ import pytest
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from cisticola.scraper import ScraperController
|
||||
from cisticola.transformer import ETLController
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = {
|
||||
|
||||
TWITTER_CHANNEL_KWARGS = {
|
||||
'id': 5,
|
||||
'name': 'Logan Williams (test)',
|
||||
'platform_id': 891729132,
|
||||
'name': 'L Weber (test)',
|
||||
'platform_id': 1424979017749442595,
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Twitter',
|
||||
'url': 'https://twitter.com/obtusatum',
|
||||
'screenname': 'obtusatum',
|
||||
'url': 'https://twitter.com/LWeber33662141',
|
||||
'screenname': 'LWeber33662141',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
@@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = {
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@pytest.fixture(scope='function')
|
||||
def controller(tmpdir_factory):
|
||||
|
||||
"""Initialize ScraperController and SQLite database file to be used for all
|
||||
tests in the package.
|
||||
"""
|
||||
|
||||
@pytest.fixture(scope='package')
def engine(tmpdir_factory):
    """Create a SQLAlchemy engine backed by a SQLite file in a temporary
    directory, to be used for all tests in the package.
    """

    db_path = tmpdir_factory.mktemp('test_data').join('test.db')
    return create_engine(f'sqlite:///{db_path}')
|
||||
|
||||
|
||||
@pytest.fixture(scope='package')
def controller(engine):
    """Provide a ScraperController connected to the shared test engine, to be
    used for all tests in the package.
    """

    ctrl = ScraperController()
    ctrl.connect_to_db(engine)
    return ctrl
|
||||
|
||||
@pytest.fixture(scope='package')
def etl_controller(engine):
    """Initialize ETLController to be used for all tests in the package."""

    # Local name differs from the fixture name to avoid shadowing it.
    etl = ETLController()
    etl.connect_to_db(engine)

    return etl
|
||||
|
||||
@pytest.fixture(scope='package')
def channel_kwargs():
    """Define keyword arguments to use for defining test channels for each
    platform to be scraped.

    Returns a dict keyed by lowercase platform name; each value is the
    keyword-argument dict for that platform's test channel.
    """

    return {
        'bitchute': BITCHUTE_CHANNEL_KWARGS,
        'gab': GAB_CHANNEL_KWARGS,
        'gettr': GETTR_CHANNEL_KWARGS,
        'odysee': ODYSEE_CHANNEL_KWARGS,
        'rumble': RUMBLE_CHANNEL_KWARGS,
        'telegram': TELEGRAM_CHANNEL_KWARGS,
        'twitter': TWITTER_CHANNEL_KWARGS}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
0
tests/scraper/__init__.py
Normal file
0
tests/scraper/__init__.py
Normal file
@@ -9,6 +9,8 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_bitchute_channel(controller, channel_kwargs):
|
||||
|
||||
controller.reset_db()
|
||||
|
||||
channels = [Channel(**channel_kwargs['bitchute'])]
|
||||
controller.register_scraper(scraper = BitchuteScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@@ -8,6 +8,8 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs):
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
def test_scrape_gab_channel(controller, channel_kwargs):
|
||||
|
||||
controller.reset_db()
|
||||
|
||||
channels = [Channel(**channel_kwargs['gab'])]
|
||||
controller.register_scraper(scraper = GabScraper())
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_gettr_channel(controller, channel_kwargs):
    """Scrape the Gettr test channel with media archiving enabled.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    gettr_channel = Channel(**channel_kwargs['gettr'])
    scraper = GettrScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[gettr_channel], archive_media=True)
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_odysee_channel(controller, channel_kwargs):
    """Scrape the Odysee test channel with media archiving enabled.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    odysee_channel = Channel(**channel_kwargs['odysee'])
    scraper = OdyseeScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[odysee_channel], archive_media=True)
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_rumble_channel(controller, channel_kwargs):
    """Scrape the Rumble test channel with media archiving enabled.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    rumble_channel = Channel(**channel_kwargs['rumble'])
    scraper = RumbleScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[rumble_channel], archive_media=True)
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
    """Scrape the Telegram test channel via TelegramSnscrapeScraper, archiving media.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    telegram_channel = Channel(**channel_kwargs['telegram'])
    scraper = TelegramSnscrapeScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[telegram_channel], archive_media=True)
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
    """Scrape the Telegram test channel via TelegramTelethonScraper, archiving media.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    telegram_channel = Channel(**channel_kwargs['telegram'])
    scraper = TelegramTelethonScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[telegram_channel], archive_media=True)
|
||||
|
||||
@@ -9,6 +9,8 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
def test_scrape_twitter_channel(controller, channel_kwargs):
    """Scrape the Twitter test channel with media archiving enabled.

    ``controller.reset_db()`` is called first, before the channel is scraped.
    """
    controller.reset_db()

    twitter_channel = Channel(**channel_kwargs['twitter'])
    scraper = TwitterScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[twitter_channel], archive_media=True)
|
||||
|
||||
0
tests/transformer/__init__.py
Normal file
0
tests/transformer/__init__.py
Normal file
30
tests/transformer/twitter.py
Normal file
30
tests/transformer/twitter.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
import json
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
from cisticola.transformer import TwitterTransformer
|
||||
from cisticola.base import TransformedResult, Media
|
||||
|
||||
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Twitter test channel, run the ETL step, and check the output.

    Steps:
      1. Reset the database and scrape the Twitter test channel with media
         archiving enabled.
      2. Register ``TwitterTransformer`` on the ETL controller and transform
         everything that has not been transformed yet.
      3. Query ``TransformedResult`` and ``Media`` rows through a session bound
         to ``engine`` and assert on their count and content.

    Args:
        engine: Fixture bound to the session used for the verification queries.
        controller: Scraper controller fixture.
        etl_controller: ETL controller fixture.
        channel_kwargs: Mapping of platform name to ``Channel`` keyword
            arguments; only the ``'twitter'`` entry is used here.
    """
    controller.reset_db()

    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=channels, archive_media=True)

    etl_controller.register_transformer(TwitterTransformer())
    etl_controller.transform_all_untransformed()

    session = sessionmaker(bind=engine)()
    try:
        posts = session.query(TransformedResult).all()
        media = session.query(Media).all()

        assert len(posts) == 3
        assert len(media) == 2

        # NOTE(review): neither query has an ORDER BY, so indexing with [-1]
        # relies on the row order the backend happens to return -- confirm
        # this is stable, or order the queries explicitly.
        assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
        assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
    finally:
        # Close the session even when an assertion fails, so its connection
        # and open transaction do not outlive this test.
        session.close()
|
||||
Reference in New Issue
Block a user