From c0a094eefaa5953968642a9e3251853ecac1c0a8 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 22 Mar 2022 11:37:47 +0100 Subject: [PATCH] Load channels from google sheet in test.py --- .gitignore | 4 +++- Pipfile | 5 +++++ test.py | 52 ++++++++++++++++++++++++++++++++-------------------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 747d85e..632ac68 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,12 @@ docs/source/_* *.db .env *.session +service_account.json +.vscode/ # Unit test / coverage reports reports .coverage .cache .pytest_cache/ -cover/ \ No newline at end of file +cover/ diff --git a/Pipfile b/Pipfile index 5457fcc..ba81a5c 100644 --- a/Pipfile +++ b/Pipfile @@ -21,12 +21,17 @@ telethon = "*" pytesseract = "*" pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} instaloader = "*" +gspread = "*" [dev-packages] pytest = "*" pytest-cov = "*" pytest-html = "*" pytest-metadata = "*" +black = "*" [requires] python_version = "3.9" + +[pipenv] +allow_prereleases = true diff --git a/test.py b/test.py index ade611c..454ad43 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,10 @@ from sqlalchemy import create_engine from loguru import logger +import gspread +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker -from cisticola.base import Channel, TransformedResult, ScraperResult +from cisticola.base import Channel, TransformedResult, ScraperResult, mapper_registry from cisticola.scraper import ( ScraperController, BitchuteScraper, @@ -14,26 +17,9 @@ from cisticola.scraper import ( TwitterScraper) from cisticola.transformer import ETLController from cisticola.transformer.twitter import TwitterTransformer -from sqlalchemy.orm import sessionmaker logger.add("../test.log") -test_channels = [ - Channel( - id=0, - name="L Weber (test)", - platform_id=1424979017749442595, - category="test", - followers=None, - platform="Twitter", - url="https://twitter.com/LWeber33662141", - screenname="LWeber33662141", - country="US", - influencer=None, - public=True, - chat=False, - notes="")] - controller = ScraperController() scrapers = [ @@ -49,9 +35,35 @@ scrapers = [ controller.register_scrapers(scrapers) engine = create_engine('sqlite:///test.db') -controller.connect_to_db(engine) +mapper_registry.metadata.create_all(bind=engine) +session_generator = sessionmaker() +session_generator.configure(bind=engine) +session = session_generator() -controller.scrape_channels(test_channels, archive_media = True) +gc = gspread.service_account(filename='service_account.json') + +# Open a sheet from a spreadsheet in one go +wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0") +channels = wks.worksheet("channels").get_all_records() + +for c in channels: + del c['followers'] + + for k in c.keys(): + if c[k] == 'TRUE': c[k] = True + if c[k] == 'FALSE': c[k] = False + + # check to see if this already exists, + channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first() + + if not channel: + channel = Channel(**c, source='researcher') + session.add(channel) + +session.commit() + +controller.connect_to_db(engine) +controller.scrape_all_channels(archive_media = True) transformer = TwitterTransformer()