From 4c580519ddcf14bcf14cdcbfe840d591c671ecff Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Sun, 3 Apr 2022 15:59:39 +0200 Subject: [PATCH] Remove Rumble scraper --- app.py | 99 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/app.py b/app.py index b950697..160dd25 100644 --- a/app.py +++ b/app.py @@ -13,14 +13,15 @@ from cisticola.scraper import ( VkontakteScraper, TelegramTelethonScraper, GettrScraper, - RumbleScraper) +) + def sync_channels(args): logger.info("Synchronizing channels") session = get_db_session() - gc = gspread.service_account(filename='service_account.json') + gc = gspread.service_account(filename="service_account.json") # Open a sheet from a spreadsheet in one go wks = gc.open_by_url(args.gsheet).worksheet("channels") @@ -29,33 +30,44 @@ def sync_channels(args): for c in channels: # only adding channels, so skip everything with an ID - if c['id'] == '': - del c['id'] - del c['followers'] + if c["id"] == "": + del c["id"] + del c["followers"] + + if c["public"] == "": + c["public"] = False + if c["chat"] == "": + c["chat"] = False - if c['public'] == '': c['public'] = False - if c['chat'] == '': c['chat'] = False - for k in c.keys(): - if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True - if c[k] == 'FALSE' or c[k] == 'no': c[k] = False + if c[k] == "TRUE" or c[k] == "yes": + c[k] = True + if c[k] == "FALSE" or c[k] == "no": + c[k] = False - if c[k] == '': c[k] = None + if c[k] == "": + c[k] = None - # check to see if this already exists, + # check to see if this already exists, platform_id = None - if c['platform_id'] != '': - platform_id = c['platform_id'] + if c["platform_id"] != "": + platform_id = c["platform_id"] - channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform=c['platform'], url=c['url']).first() + channel = ( + session.query(Channel) + .filter_by( + platform_id=str(platform_id), platform=c["platform"], url=c["url"] + ) + .first() + ) if not channel: - channel = Channel(**c, source='researcher') + channel = Channel(**c, source="researcher") logger.debug(f"{channel} does not exist, adding") session.add(channel) session.flush() session.commit() - + wks.update_cell(row, 1, channel.id) time.sleep(1) @@ -63,37 +75,36 @@ def sync_channels(args): session.commit() + def get_db_session(): - engine = create_engine(os.environ['DB']) - + engine = create_engine(os.environ["DB"]) + session_generator = sessionmaker() session_generator.configure(bind=engine) session = session_generator() return session + def get_scraper_controller(): - engine = create_engine(os.environ['DB']) + engine = create_engine(os.environ["DB"]) controller = ScraperController() controller.connect_to_db(engine) - scrapers = [ - TelegramTelethonScraper(), - VkontakteScraper(), - GettrScraper(), - RumbleScraper() - ] + scrapers = [TelegramTelethonScraper(), VkontakteScraper(), GettrScraper()] controller.register_scrapers(scrapers) return controller + def scrape_channels(args): logger.info(f"Scraping channels, media: {args.media}") controller = get_scraper_controller() - controller.scrape_all_channels(archive_media = args.media) + controller.scrape_all_channels(archive_media=args.media) + def scrape_channel_info(args): logger.info(f"Scraping channel info") @@ -101,38 +112,48 @@ def scrape_channel_info(args): controller = get_scraper_controller() controller.scrape_all_channel_info() + def archive_media(args): logger.info(f"Archiving unarchived media") controller = get_scraper_controller() controller.archive_unarchived_media() + def init_db(): - engine = create_engine(os.environ['DB']) + engine = create_engine(os.environ["DB"]) mapper_registry.metadata.create_all(bind=engine) -if __name__ == '__main__': + +if __name__ == "__main__": logger.remove() logger.add(sys.stdout, level="DEBUG", catch=True) logger.add("./test.log", level="TRACE") - parser = argparse.ArgumentParser(description = 'Cisticola command line tools') - parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"') - parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize') - parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media') + parser = argparse.ArgumentParser(description="Cisticola command line tools") + parser.add_argument( + "command", + type=str, + help='Command to run: "sync-channels", "scrape-channels", or "archive-media"', + ) + parser.add_argument( + "--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize" + ) + parser.add_argument( + "--media", action="store_true", help="[scrape-channels] Add this flag to media" + ) args = parser.parse_args() - - if args.command == 'init-db': + if args.command == "init-db": init_db() - elif args.command == 'sync-channels': + elif args.command == "sync-channels": sync_channels(args) - elif args.command == 'scrape-channels': + elif args.command == "scrape-channels": scrape_channels(args) - elif args.command == 'archive-media': + elif args.command == "archive-media": archive_media(args) - elif args.command == 'channel-info': + elif args.command == "channel-info": scrape_channel_info(args) else: logger.error(f"Unrecognized command {args.command}")