diff --git a/app.py b/app.py index b30f14c..03462c5 100644 --- a/app.py +++ b/app.py @@ -8,7 +8,7 @@ import sys from cisticola.base import mapper_registry from cisticola.scraper import ( ScraperController, - VkontakteScraper, +# VkontakteScraper, TelegramTelethonScraper, GettrScraper, BitchuteScraper, @@ -20,7 +20,7 @@ from cisticola.transformer import ( GettrTransformer, RumbleTransformer, BitchuteTransformer, - VkontakteTransformer, +# VkontakteTransformer, ) from sync_with_gsheet import sync_channels @@ -41,7 +41,7 @@ def get_scraper_controller(telethon_session_name = None): controller = ScraperController() controller.connect_to_db(engine) - scrapers = [VkontakteScraper(), + scrapers = [ #VkontakteScraper(), TelegramTelethonScraper(telethon_session_name = telethon_session_name), GettrScraper(), BitchuteScraper(), @@ -57,7 +57,7 @@ def get_transformer_controller(): controller = ETLController() controller.connect_to_db(engine) - transformers = [VkontakteTransformer(), + transformers = [ #VkontakteTransformer(), TelegramTelethonTransformer(), GettrTransformer(), BitchuteTransformer(), @@ -147,28 +147,28 @@ if __name__ == "__main__": if args.command == "init-db": init_db() elif args.command == "sync-channels": - logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB") + logger.add("logs/sync-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") sync_channels(args, get_db_session()) elif args.command == "scrape-channels": - logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB") + logger.add("logs/scrape-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") scrape_channels(args) elif args.command == "scrape-channels-old": - logger.add("logs/scrape-channels-old.log", level="TRACE", rotation="100 MB") + logger.add("logs/scrape-channels-old.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") scrape_channels_old(args) elif args.command == "archive-media": - logger.add("logs/archive-media.log", level="TRACE", rotation="100 MB") + logger.add("logs/archive-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") archive_media(args) elif args.command == "channel-info": - logger.add("logs/channel-info.log", level="TRACE", rotation="100 MB") + logger.add("logs/channel-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") scrape_channel_info(args) elif args.command == "transform": - logger.add("logs/transform.log", level="TRACE", rotation="100 MB") + logger.add("logs/transform.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") transform(args) elif args.command == "transform-info": - logger.add("logs/transform-info.log", level="TRACE", rotation="100 MB") + logger.add("logs/transform-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") transform_info(args) elif args.command == "transform-media": - logger.add("logs/transform-media.log", level="TRACE", rotation="100 MB") + logger.add("logs/transform-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") transform_media(args) else: logger.error(f"Unrecognized command {args.command}") diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index c9b777a..6219bd8 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -220,6 +220,7 @@ class ETLController: batch = (session.query(ScraperResult) .join(Post, isouter=True) + .where(ScraperResult.id > 35000000) # TODO this can be a CLI argument or something .where(Post.raw_id == None) .order_by(ScraperResult.date.asc()) .limit(BATCH_SIZE) @@ -234,6 +235,7 @@ class ETLController: batch = (session.query(ScraperResult) .join(Post, isouter=True) + .where(ScraperResult.id > 35000000) # TODO this can be a CLI argument or something .where(Post.raw_id == None) .where(ScraperResult.date >= max(batch, key=lambda v: v.date).date) .order_by(ScraperResult.date.asc())