Files
cisticola/app.py
2023-05-04 10:00:14 +00:00

194 lines
6.1 KiB
Python

import argparse
from asyncio import streams
from loguru import logger
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import datetime
import os
import sys
from cisticola.base import mapper_registry
from cisticola.scraper import (
ScraperController,
# VkontakteScraper,
TelegramTelethonScraper,
GettrScraper,
BitchuteScraper,
RumbleScraper,
)
from cisticola.transformer import (
ETLController,
TelegramTelethonTransformer,
GettrTransformer,
RumbleTransformer,
BitchuteTransformer,
# VkontakteTransformer,
)
from sync_with_gsheet import sync_channels
def get_db_session():
    """Open a new SQLAlchemy session on the configured database.

    The connection URL is read from the ``DB`` environment variable.

    Returns:
        A session produced by a ``sessionmaker`` bound to the new engine.

    Raises:
        KeyError: If the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ["DB"])
    # Bind at construction time instead of via a separate configure() call.
    make_session = sessionmaker(bind=db_engine)
    return make_session()
def get_scraper_controller(args):
    """Build a ``ScraperController`` connected to the database.

    Args:
        args: Parsed command line arguments; only ``args.telethon_session``
            is read here.

    Returns:
        A ``ScraperController`` that has been connected to an engine built
        from the ``DB`` environment variable and has the Telegram, Gettr,
        Bitchute and Rumble scrapers registered.

    Raises:
        KeyError: If the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ["DB"])
    scraper_controller = ScraperController()
    scraper_controller.connect_to_db(db_engine)

    # Any falsy value (missing flag, empty string) is normalised to None.
    session_name = args.telethon_session or None

    scraper_controller.register_scrapers(
        [
            # VkontakteScraper() is currently disabled.
            TelegramTelethonScraper(telethon_session_name=session_name),
            GettrScraper(),
            BitchuteScraper(),
            RumbleScraper(),
        ]
    )
    return scraper_controller
def get_transformer_controller(args):
    """Build an ``ETLController`` connected to the database.

    Args:
        args: Parsed command line arguments; only ``args.telethon_session``
            is read here.

    Returns:
        An ``ETLController`` that has been connected to an engine built from
        the ``DB`` environment variable and has the Telegram, Gettr, Bitchute
        and Rumble transformers registered.

    Raises:
        KeyError: If the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ["DB"])
    etl_controller = ETLController()
    etl_controller.connect_to_db(db_engine)

    # Any falsy value (missing flag, empty string) is normalised to None.
    session_name = args.telethon_session or None

    etl_controller.register_transformers(
        [
            # VkontakteTransformer() is currently disabled.
            TelegramTelethonTransformer(telethon_session_name=session_name),
            GettrTransformer(),
            BitchuteTransformer(),
            RumbleTransformer(),
        ]
    )
    return etl_controller
def scrape_channels(args):
    """Run the ``scrape-channels`` command.

    Args:
        args: Parsed command line arguments. ``args.media`` is forwarded to
            the controller as ``archive_media``; ``args.telethon_session``
            is used when building the controller.
    """
    logger.info(f"Scraping channels, media: {args.media}")
    get_scraper_controller(args).scrape_all_channels(archive_media=args.media)
def scrape_channels_old(args):
    """Run the ``scrape-channels-old`` command.

    Same as ``scrape_channels`` except that the controller is called with
    ``fetch_old=True``.

    Args:
        args: Parsed command line arguments. ``args.media`` is forwarded to
            the controller as ``archive_media``.
    """
    logger.info(f"Scraping old posts from channels, media: {args.media}")
    get_scraper_controller(args).scrape_all_channels(
        archive_media=args.media,
        fetch_old=True,
    )
def scrape_channel_info(args):
    """Run the ``channel-info`` command.

    Args:
        args: Parsed command line arguments, used only to build the scraper
            controller.
    """
    logger.info("Scraping channel info")
    get_scraper_controller(args).scrape_all_channel_info()
def archive_media(args):
    """Run the ``archive-media`` command.

    Args:
        args: Parsed command line arguments. When ``args.chronological`` is
            truthy the controller is called with ``chronological=True``;
            otherwise the keyword is omitted so the controller's own default
            applies.
    """
    logger.info("Archiving unarchived media")
    media_controller = get_scraper_controller(args)
    # Only pass the keyword when requested; never send chronological=False.
    extra_kwargs = {"chronological": True} if args.chronological else {}
    media_controller.archive_unarchived_media(**extra_kwargs)
def transform(args):
    """Run the ``transform`` command.

    Args:
        args: Parsed command line arguments. ``args.min_date``, when given,
            is parsed with ``datetime.datetime.fromisoformat`` and passed to
            the controller as ``min_date``.

    Raises:
        ValueError: If ``args.min_date`` is not a valid ISO format string.
    """
    logger.info("Transforming untransformed posts")
    etl_controller = get_transformer_controller(args)
    # NOTE(review): the fallback is the integer 0 rather than a datetime;
    # presumably the controller treats it as "no lower bound" - confirm.
    earliest = (
        datetime.datetime.fromisoformat(args.min_date) if args.min_date else 0
    )
    etl_controller.transform_all_untransformed(min_date=earliest)
def transform_info(args):
    """Run the ``transform-info`` command.

    Args:
        args: Parsed command line arguments, used only to build the
            transformer controller.
    """
    logger.info("Transforming untransformed channel info")
    get_transformer_controller(args).transform_all_untransformed_info()
    # Channel sync after this step is currently disabled:
    # sync_channels(args, get_db_session())
def transform_media(args):
    """Run the ``transform-media`` command.

    Args:
        args: Parsed command line arguments, used only to build the
            transformer controller.
    """
    logger.info("Transforming untransformed channel media")
    get_transformer_controller(args).transform_all_untransformed_media()
def init_db():
    """Run the ``init-db`` command.

    Builds an engine from the ``DB`` environment variable and calls
    ``create_all`` on ``mapper_registry.metadata`` with it (presumably a
    SQLAlchemy registry whose tables get created - confirm in
    ``cisticola.base``).

    Raises:
        KeyError: If the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ["DB"])
    schema = mapper_registry.metadata
    schema.create_all(bind=db_engine)
def build_parser():
    """Build the argument parser for the Cisticola command line tools.

    Returns:
        An ``argparse.ArgumentParser`` with the positional ``command`` and
        the optional ``--gsheet``, ``--media``, ``--chronological``,
        ``--telethon_session`` and ``--min_date`` arguments.
    """
    parser = argparse.ArgumentParser(description="Cisticola command line tools")
    parser.add_argument(
        "command",
        type=str,
        # List every command the dispatcher below actually accepts.
        help=(
            'Command to run: "init-db", "sync-channels", "scrape-channels", '
            '"scrape-channels-old", "archive-media", "channel-info", '
            '"transform", "transform-info", or "transform-media"'
        ),
    )
    parser.add_argument(
        "--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize"
    )
    parser.add_argument(
        "--media",
        action="store_true",
        help=(
            "[scrape-channels, scrape-channels-old] Add this flag to archive "
            "media while scraping"
        ),
    )
    parser.add_argument(
        "--chronological",
        action="store_true",
        help="[archive-media] Add this flag to archive unarchived media chronologically",
    )
    parser.add_argument(
        "--telethon_session",
        type=str,
        help="Telethon session name passed to the Telegram scraper and transformer",
    )
    parser.add_argument(
        "--min_date",
        type=str,
        help=(
            "[transform] Minimum date in ISO format (e.g. 2023-01-31), "
            "passed to the transformer as min_date"
        ),
    )
    return parser


if __name__ == "__main__":
    # Reset the logger's sinks and log everything from DEBUG up to stdout.
    logger.remove()
    logger.add(sys.stdout, level="DEBUG", catch=True)

    args = build_parser().parse_args()

    # Command name -> callable taking the parsed arguments.
    handlers = {
        "init-db": lambda parsed: init_db(),
        "sync-channels": lambda parsed: sync_channels(parsed, get_db_session()),
        "scrape-channels": scrape_channels,
        "scrape-channels-old": scrape_channels_old,
        "archive-media": archive_media,
        "channel-info": scrape_channel_info,
        "transform": transform,
        "transform-info": transform_info,
        "transform-media": transform_media,
    }

    handler = handlers.get(args.command)
    if handler is None:
        logger.error(f"Unrecognized command {args.command}")
        # Exit non-zero so shells and schedulers can detect the failure.
        sys.exit(1)

    # Every command except init-db gets its own rotating log file, named
    # after the command itself.
    if args.command != "init-db":
        logger.add(
            f"logs/{args.command}.log",
            level="DEBUG",
            rotation="100 MB",
            retention="2 weeks",
            compression="zip",
        )
    # transform additionally keeps a short-lived TRACE-level log.
    if args.command == "transform":
        logger.add("logs/transform_trace.log", level="TRACE", retention="7 days")

    handler(args)