From 6092e4caa5aaef84818c49862840412fce22b520 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 16:36:55 +0100 Subject: [PATCH] Add method for archiving media, reorganize scraper base classes --- .env | 5 +++++ Pipfile | 3 ++- Pipfile.lock | 42 +++++++++++++++++++++++++++++------ cisticola/__init__.py | 6 ++--- cisticola/base.py | 1 + cisticola/scraper/__init__.py | 18 --------------- cisticola/scraper/twitter.py | 3 ++- test.py | 17 ++++++++------ 8 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 .env diff --git a/.env b/.env new file mode 100644 index 0000000..ace9ca9 --- /dev/null +++ b/.env @@ -0,0 +1,5 @@ +DO_SPACES_REGION=ams3 +DO_SPACES_KEY=DKIMQ7ABHPOBC4OZDEQR +DO_SPACES_SECRET=uqKaPQsV4WmskQr8/O2NTS+OHiTNV2yVJn8u9Ny0rsA +DO_BUCKET=cisticola-test +DO_URL=https://cisticola-test.ams3.digitaloceanspaces.com \ No newline at end of file diff --git a/Pipfile b/Pipfile index cd6422b..533ee24 100644 --- a/Pipfile +++ b/Pipfile @@ -5,13 +5,14 @@ name = "pypi" [packages] sqlalchemy = "*" -snscrape = "*" loguru = "*" gogettr = "*" requests = "*" bs4 = "*" dateparser = "*" sphinx = "*" +boto3 = "*" +snscrape = {git = "https://github.com/bellingcat/snscrape.git"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 4b40a97..919e8f0 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7" + "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561" }, "pipfile-spec": 6, "requires": { @@ -39,6 +39,22 @@ "markers": "python_version >= '3.1'", "version": "==4.10.0" }, + "boto3": { + "hashes": [ + "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d", + "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887" + ], + "index": "pypi", + "version": "==1.21.6" + }, + "botocore": { + "hashes": [ + "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c", 
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949" + ], + "markers": "python_version >= '3.6'", + "version": "==1.24.6" + }, "bs4": { "hashes": [ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" @@ -194,6 +210,14 @@ "markers": "python_version >= '3.6'", "version": "==3.0.3" }, + "jmespath": { + "hashes": [ + "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", + "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.0" + }, "loguru": { "hashes": [ "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", @@ -460,6 +484,14 @@ "index": "pypi", "version": "==2.27.1" }, + "s3transfer": { + "hashes": [ + "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", + "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + ], + "markers": "python_version >= '3.6'", + "version": "==0.5.1" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", @@ -476,12 +508,8 @@ "version": "==2.2.0" }, "snscrape": { - "hashes": [ - "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", - "sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2" - ], - "index": "pypi", - "version": "==0.4.3.20220106" + "git": "https://github.com/bellingcat/snscrape.git", + "ref": "72b26f2373f3fecf53bdf9c62d7408df3d15a329" }, "soupsieve": { "hashes": [ diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 115d143..ee24a4e 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -1,6 +1,6 @@ from typing import List -import cisticola.scraper import cisticola.base +import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger @@ -14,7 +14,7 @@ class ScraperController: self.session = None self.mapper_registry = None - def 
register_scraper(self, scraper: cisticola.scraper.Scraper): + def register_scraper(self, scraper: cisticola.scraper.base.Scraper): self.scrapers.append(scraper) def scrape_channels(self, channels: List[cisticola.base.Channel]): @@ -30,7 +30,7 @@ class ScraperController: # get most recent post session = self.session() rows = session.query(cisticola.base.ScraperResult).order_by( - cisticola.base.ScraperResult.date_archived).limit(1).all() + cisticola.base.ScraperResult.date.desc()).limit(1).all() if len(rows) == 1: since = rows[0] diff --git a/cisticola/base.py b/cisticola/base.py index 57880d0..da811df 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -42,6 +42,7 @@ class Channel: followers: int platform: str url: str + screenname: str country: str influencer: str public: bool diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index c8a3166..e69de29 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,18 +0,0 @@ -from typing import List -import cisticola.base - - -class Scraper: - __version__ = "Scraper 0.0.0" - - def __init__(self): - pass - - def __str__(self): - return self.__version__ - - def can_handle(self, channel: cisticola.base.Channel) -> bool: - pass - - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - pass diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 19766a4..e3c399d 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,10 +1,11 @@ import cisticola.base +import cisticola.scraper.base from datetime import datetime from typing import List import snscrape.modules -class TwitterScraper(cisticola.scraper.Scraper): +class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" diff --git a/test.py b/test.py index b8c093d..1885487 100644 --- 
a/test.py +++ b/test.py @@ -3,36 +3,39 @@ # still need to do some planning for handling media import cisticola -import cisticola.scraper.twitter +import cisticola.scraper.telegram_snscrape from sqlalchemy import create_engine test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", country="US", + url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, category="qanon", followers=None, platform="Telegram", - url="https://t.me/jqhnspartan", country="FR", + url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR", influencer="JQNH SPARTAN", public=True, chat=False, notes=""), cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', category="qanon", followers=None, platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", country="US", + url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC', category="nazi", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US", + url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US", influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController() -scraper = cisticola.scraper.twitter.TwitterScraper() +# scraper = cisticola.scraper.twitter.TwitterScraper() +# controller.register_scraper(scraper) + +scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(scraper) -engine = create_engine('sqlite:///test.db') +engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) 
controller.scrape_channels(test_channels)