Merge pull request #2 from bellingcat/media

WIP: Archiving media, organization improvements
This commit is contained in:
Tristan Lee
2022-02-25 08:26:58 -06:00
committed by GitHub
12 changed files with 248 additions and 88 deletions

2
.gitignore vendored
View File

@@ -4,4 +4,4 @@
*.db
docs/build/
docs/source/_*
.env

View File

@@ -5,13 +5,14 @@ name = "pypi"
[packages]
sqlalchemy = "*"
snscrape = "*"
loguru = "*"
gogettr = "*"
requests = "*"
bs4 = "*"
dateparser = "*"
sphinx = "*"
boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
[dev-packages]

42
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7"
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
},
"pipfile-spec": 6,
"requires": {
@@ -39,6 +39,22 @@
"markers": "python_version >= '3.1'",
"version": "==4.10.0"
},
"boto3": {
"hashes": [
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
],
"index": "pypi",
"version": "==1.21.6"
},
"botocore": {
"hashes": [
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.6"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
@@ -194,6 +210,14 @@
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": {
"hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
"sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.0"
},
"loguru": {
"hashes": [
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
@@ -460,6 +484,14 @@
"index": "pypi",
"version": "==2.27.1"
},
"s3transfer": {
"hashes": [
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
@@ -476,12 +508,8 @@
"version": "==2.2.0"
},
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
],
"index": "pypi",
"version": "==0.4.3.20220106"
"git": "https://github.com/bellingcat/snscrape.git",
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
},
"soupsieve": {
"hashes": [

View File

@@ -1,6 +1,6 @@
from typing import List
import cisticola.scraper
import cisticola.base
import cisticola.scraper.base
from sqlalchemy.orm import sessionmaker
from loguru import logger
@@ -14,7 +14,7 @@ class ScraperController:
self.session = None
self.mapper_registry = None
def register_scraper(self, scraper: cisticola.scraper.Scraper):
def register_scraper(self, scraper: cisticola.scraper.base.Scraper):
self.scrapers.append(scraper)
def scrape_channels(self, channels: List[cisticola.base.Channel]):
@@ -27,10 +27,15 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.can_handle(channel):
session = self.session()
handled = True
added = 0
# get most recent post
session = self.session()
rows = session.query(cisticola.base.ScraperResult).order_by(
cisticola.base.ScraperResult.date_archived).limit(1).all()
rows = session.query(cisticola.base.ScraperResult).where(
cisticola.base.ScraperResult.channel == channel.id).order_by(
cisticola.base.ScraperResult.date.desc()).limit(1).all()
if len(rows) == 1:
since = rows[0]
@@ -38,21 +43,19 @@ class ScraperController:
since = None
posts = scraper.get_posts(channel, since=since)
handled = True
for post in posts:
session.add(post)
added += 1
session.commit()
logger.info(
f"{scraper} found {len(posts)} new posts from {channel}")
f"{scraper} found {added} new posts from {channel}")
break
if not handled:
logger.warning(f"No handler found for Channel {channel}")
session = self.session()
session.bulk_save_objects(posts)
session.commit()
logger.info(f"Added {len(posts)} entries to database")
def connect_to_db(self, engine):
# create tables
cisticola.base.mapper_registry.metadata.create_all(bind=engine)

View File

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
mapper_registry = registry()
@@ -12,11 +12,12 @@ class ScraperResult:
scraper: str
platform: str
channel: int
channel: int #TODO there is probably a way of making this a Channel object foreign key
platform_id: str
date: datetime
raw_data: str
date_archived: datetime
archived_urls: dict
raw_data_table = Table('raw_data', mapper_registry.metadata,
@@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime))
Column('date_archived', DateTime),
Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
@@ -42,6 +44,7 @@ class Channel:
followers: int
platform: str
url: str
screenname: str
country: str
influencer: str
public: bool

View File

@@ -1,18 +0,0 @@
from typing import List
import cisticola.base
class Scraper:
    """Interface shared by all platform scrapers.

    The base class does no work itself: `can_handle` and `get_posts` are
    placeholders that return None and are meant to be overridden.
    """

    __version__ = "Scraper 0.0.0"

    def __init__(self):
        """Nothing to initialise in the base class."""

    def __str__(self):
        """Identify the scraper by its version string."""
        return self.__version__

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Report whether this scraper supports `channel` (placeholder: returns None)."""
        return None

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Collect posts from `channel`, optionally bounded by `since` (placeholder: returns None)."""
        return None

61
cisticola/scraper/base.py Normal file
View File

@@ -0,0 +1,61 @@
from typing import Generator
import cisticola.base
import requests
import os
import boto3
from io import BytesIO
from loguru import logger
class Scraper:
    """Base class for platform scrapers, with shared media archiving.

    On construction an S3 client is created from the `DO_SPACES_REGION`,
    `DO_SPACES_KEY` and `DO_SPACES_SECRET` environment variables.
    `archive_media` uploads to the bucket named by `DO_BUCKET` and builds the
    returned URL from `DO_URL`.

    `can_handle` and `get_posts` are placeholders that return None and are
    meant to be overridden by subclasses.
    """

    __version__ = "Scraper 0.0.0"

    # Seconds a single HTTP attempt may stall (connect or read) before it is
    # treated as a failed attempt; without it one dead host blocks the run.
    request_timeout = 30

    def __init__(self):
        region = os.getenv('DO_SPACES_REGION')
        self.s3_client = boto3.client(
            's3',
            region_name=region,
            endpoint_url='https://{}.digitaloceanspaces.com'.format(region),
            aws_access_key_id=os.getenv('DO_SPACES_KEY'),
            aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))

    def __str__(self):
        """Identify the scraper by its version string."""
        return self.__version__

    def _fetch(self, url: str, max_retries: int = 5):
        """GET `url`, retrying up to `max_retries` times; return the response or None.

        An attempt fails when the status code is not 200 or when `requests`
        raises (connection error, timeout, ...).
        """
        for attempt in range(max_retries + 1):
            try:
                response = requests.get(url, timeout=self.request_timeout)
            except requests.RequestException as exc:
                logger.warning(f"Request for {url} raised {exc!r}")
                response = None

            if response is not None and response.status_code == 200:
                return response

            # Only announce a retry when one is actually going to happen.
            if attempt < max_retries:
                logger.warning(f"{attempt}/{max_retries}: Request for {url} failed")

        return None

    def archive_media(self, url: str, key: str = None) -> str:
        """Download `url` and upload it to the archive bucket.

        Args:
            url: Location of the media file to archive.
            key: Object name to store the file under. Defaults to the last
                path segment of `url` with any query string removed.

        Returns:
            The public URL of the archived copy, or `url` unchanged when the
            file could not be downloaded.
        """
        response = self._fetch(url)
        if response is None:
            logger.error(f"Could not fetch URL {url}")
            return url

        if key is None:
            # Drop the query string first so a '/' inside it cannot be
            # mistaken for a path separator.
            key = url.split('?')[0].split('/')[-1]

        # Objects are grouped under a prefix derived from the scraper version.
        filename = self.__version__.replace(' ', '_') + '/' + key

        extra_args = {'ACL': 'public-read'}
        content_type = response.headers.get('Content-Type')
        if content_type:
            # Only send ContentType when the server reported one; None is not
            # a valid value for this upload argument.
            extra_args['ContentType'] = content_type

        self.s3_client.upload_fileobj(
            BytesIO(response.content),
            Bucket=os.getenv('DO_BUCKET'),
            Key=filename,
            ExtraArgs=extra_args)

        return os.getenv('DO_URL') + '/' + filename

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Report whether this scraper supports `channel` (placeholder: returns None)."""

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Produce posts from `channel`, optionally bounded by `since` (placeholder: returns None)."""

View File

@@ -4,7 +4,7 @@ import re
from html.parser import HTMLParser
import dateparser
import json
from typing import List
from typing import Generator
import requests
from bs4 import BeautifulSoup
@@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.Scraper):
return username
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

View File

@@ -1,10 +1,11 @@
import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import List
from typing import Generator
from gogettr import PublicClient
class GettrScraper(cisticola.scraper.Scraper):
class GettrScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -15,26 +16,41 @@ class GettrScraper(cisticola.scraper.Scraper):
return username
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
posts = []
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
for post in scraper:
if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
break
posts.append(cisticola.base.ScraperResult(
archived_urls = {}
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
archived_url = self.archive_media(url)
archived_urls[img] = archived_url
if 'main' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
archived_urls[post['main']] = archived_url
# TODO this is just archiving the playlist file, not the actual video
if 'vid' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
archived_urls[post['vid']] = archived_url
yield cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Gettr",
channel=username,
channel=channel.id,
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(),
raw_data=json.dumps(post)))
return posts
raw_data=json.dumps(post),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:

View File

@@ -0,0 +1,44 @@
import cisticola.base
import cisticola.scraper.base
from typing import Generator
import snscrape.modules
from datetime import datetime, timezone
class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
    """Scraper for Telegram channels, built on snscrape's TelegramChannelScraper."""

    __version__ = "TelegramSnscrapeScraper 0.0.1"

    def can_handle(self, channel):
        """Return True for a public, non-chat Telegram channel; otherwise None."""
        if channel.platform != "Telegram":
            return None
        if not channel.public or channel.chat:
            return None
        return True

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield a ScraperResult for each post of `channel`, archiving its media.

        Iteration stops at the first post whose date is not later than
        `since.date` (both treated as UTC). This presumably relies on snscrape
        returning newest posts first; confirm against the snscrape fork in use.
        """
        telegram_scraper = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        for item in telegram_scraper.get_items():
            if since is not None:
                posted_at = item.date.replace(tzinfo=timezone.utc)
                cutoff = since.date.replace(tzinfo=timezone.utc)
                if posted_at <= cutoff:
                    break

            # Maps each original media URL to the URL of its archived copy.
            media_archive = {
                image: self.archive_media(image) for image in item.images
            }
            if item.video:
                media_archive[item.video] = self.archive_media(item.video)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Telegram",
                channel=channel.id,
                platform_id=item.url,
                date=item.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=item.json(),
                archived_urls=media_archive,
            )

View File

@@ -1,42 +1,59 @@
import cisticola.base
from datetime import datetime
from typing import List
import cisticola.scraper.base
from datetime import datetime, timezone
from typing import Generator
import snscrape.modules
from loguru import logger
class TwitterScraper(cisticola.scraper.Scraper):
class TwitterScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
# TODO snscrape should be able to scrape from user ID alone, but there is
# currently a bug/other issue, so it is extracting the username from URL
def get_username_from_url(url):
username = url.split("twitter.com/")[1]
if len(username.split("/")) > 1:
return None
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id)
return username
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
posts = []
scraper = snscrape.modules.twitter.TwitterUserScraper(
TwitterScraper.get_username_from_url(channel.url))
first = True
for tweet in scraper.get_items():
if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp():
break
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
if first:
first = False
continue
else:
break
posts.append(cisticola.base.ScraperResult(
archived_urls = {}
if tweet.media:
for media in tweet.media:
if type(media) == snscrape.modules.twitter.Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == snscrape.modules.twitter.Gif:
url = media.variants[0].url
elif type(media) == snscrape.modules.twitter.Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None:
archived_url = self.archive_media(url)
archived_urls[url] = archived_url
yield cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Twitter",
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json()))
return posts
raw_data=tweet.json(),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Twitter" and channel.platform_id:
return True

29
test.py
View File

@@ -1,38 +1,43 @@
# TODO/TODECIDE:
# should 'username' be a part of the Channel definition somehow?
# still need to do some planning for handling media
import cisticola
import cisticola.scraper.telegram_snscrape
import cisticola.scraper.twitter
import cisticola.scraper.gettr
from sqlalchemy import create_engine
test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
test_channels = [
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
category="test", followers=None, platform="Twitter",
url="https://twitter.com/obtusatum", country="US",
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
influencer=None, public=True, chat=False,
notes=""),
cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026,
category="qanon", followers=None, platform="Telegram",
url="https://t.me/jqhnspartan", country="FR",
url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR",
influencer="JQNH SPARTAN", public=True, chat=False, notes=""),
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
category="qanon", followers=None, platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic", country="US",
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
category="nazi", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
influencer=None, public=True, chat=False, notes=""),]
controller = cisticola.ScraperController()
scraper = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(scraper)
twitter = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(twitter)
engine = create_engine('sqlite:///test.db')
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
controller.register_scraper(telegram)
gettr = cisticola.scraper.gettr.GettrScraper()
controller.register_scraper(gettr)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)