Add method for archiving media, reorganize scraper base classes

This commit is contained in:
Logan Williams
2022-02-24 16:36:55 +01:00
parent e09e0f5202
commit 6092e4caa5
8 changed files with 58 additions and 37 deletions

5
.env Normal file
View File

@@ -0,0 +1,5 @@
DO_SPACES_REGION=ams3
DO_SPACES_KEY=DKIMQ7ABHPOBC4OZDEQR
DO_SPACES_SECRET=uqKaPQsV4WmskQr8/O2NTS+OHiTNV2yVJn8u9Ny0rsA
DO_BUCKET=cisticola-test
DO_URL=https://cisticola-test.ams3.digitaloceanspaces.com

View File

@@ -5,13 +5,14 @@ name = "pypi"
[packages]
sqlalchemy = "*"
snscrape = "*"
loguru = "*"
gogettr = "*"
requests = "*"
bs4 = "*"
dateparser = "*"
sphinx = "*"
boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
[dev-packages]

42
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7"
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
},
"pipfile-spec": 6,
"requires": {
@@ -39,6 +39,22 @@
"markers": "python_version >= '3.1'",
"version": "==4.10.0"
},
"boto3": {
"hashes": [
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
],
"index": "pypi",
"version": "==1.21.6"
},
"botocore": {
"hashes": [
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.6"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
@@ -194,6 +210,14 @@
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": {
"hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
"sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.0"
},
"loguru": {
"hashes": [
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
@@ -460,6 +484,14 @@
"index": "pypi",
"version": "==2.27.1"
},
"s3transfer": {
"hashes": [
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
@@ -476,12 +508,8 @@
"version": "==2.2.0"
},
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
],
"index": "pypi",
"version": "==0.4.3.20220106"
"git": "https://github.com/bellingcat/snscrape.git",
"ref": "72b26f2373f3fecf53bdf9c62d7408df3d15a329"
},
"soupsieve": {
"hashes": [

View File

@@ -1,6 +1,6 @@
from typing import List
import cisticola.scraper
import cisticola.base
import cisticola.scraper.base
from sqlalchemy.orm import sessionmaker
from loguru import logger
@@ -14,7 +14,7 @@ class ScraperController:
self.session = None
self.mapper_registry = None
def register_scraper(self, scraper: cisticola.scraper.Scraper):
def register_scraper(self, scraper: cisticola.scraper.base.Scraper):
self.scrapers.append(scraper)
def scrape_channels(self, channels: List[cisticola.base.Channel]):
@@ -30,7 +30,7 @@ class ScraperController:
# get most recent post
session = self.session()
rows = session.query(cisticola.base.ScraperResult).order_by(
cisticola.base.ScraperResult.date_archived).limit(1).all()
cisticola.base.ScraperResult.date.desc()).limit(1).all()
if len(rows) == 1:
since = rows[0]

View File

@@ -42,6 +42,7 @@ class Channel:
followers: int
platform: str
url: str
screenname: str
country: str
influencer: str
public: bool

View File

@@ -1,18 +0,0 @@
from typing import List
import cisticola.base
class Scraper:
    """Base scraper type whose methods are all placeholders.

    Nothing here performs any scraping: ``can_handle`` and ``get_posts``
    have empty bodies, so they are presumably meant to be overridden by
    platform-specific subclasses (confirm against the subclasses).
    """

    # Version label for this scraper; this is what __str__ returns.
    __version__ = "Scraper 0.0.0"

    def __init__(self) -> None:
        # No state is set up by the base class.
        pass

    def __str__(self) -> str:
        """Return the ``__version__`` label of this scraper."""
        return self.__version__

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Placeholder for deciding whether this scraper applies to *channel*.

        The annotation declares a ``bool`` result, but the body is ``pass``,
        so this base implementation actually returns ``None`` (falsy).
        """
        pass

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Placeholder for fetching posts from *channel*.

        ``since`` defaults to ``None``; presumably it marks the most recent
        already-archived result so only newer posts are fetched (verify
        against the caller). The annotation declares a list of
        ``ScraperResult``, but the body is ``pass``, so this base
        implementation actually returns ``None``.
        """
        pass

View File

@@ -1,10 +1,11 @@
import cisticola.base
import cisticola.scraper.base
from datetime import datetime
from typing import List
import snscrape.modules
class TwitterScraper(cisticola.scraper.Scraper):
class TwitterScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"

17
test.py
View File

@@ -3,36 +3,39 @@
# still need to do some planning for handling media
import cisticola
import cisticola.scraper.twitter
import cisticola.scraper.telegram_snscrape
from sqlalchemy import create_engine
test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
category="test", followers=None, platform="Twitter",
url="https://twitter.com/obtusatum", country="US",
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
influencer=None, public=True, chat=False,
notes=""),
cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026,
category="qanon", followers=None, platform="Telegram",
url="https://t.me/jqhnspartan", country="FR",
url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR",
influencer="JQNH SPARTAN", public=True, chat=False, notes=""),
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
category="qanon", followers=None, platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic", country="US",
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
category="nazi", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
influencer=None, public=True, chat=False, notes=""),]
controller = cisticola.ScraperController()
scraper = cisticola.scraper.twitter.TwitterScraper()
# scraper = cisticola.scraper.twitter.TwitterScraper()
# controller.register_scraper(scraper)
scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
controller.register_scraper(scraper)
engine = create_engine('sqlite:///test.db')
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)