Transformer for Telegram, base transformer NLP hydration; no media

This commit is contained in:
Logan Williams
2022-04-14 11:45:09 +02:00
parent 59bab0d812
commit 4c221d1133
9 changed files with 263 additions and 79 deletions

21
app.py
View File

@@ -16,6 +16,7 @@ from cisticola.scraper import (
BitchuteScraper,
RumbleScraper,
)
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
def sync_channels(args):
@@ -122,6 +123,18 @@ def get_scraper_controller():
return controller
def get_transformer_controller():
    """Build an ETLController connected to the database with its transformers.

    The engine is created from the value of the ``DB`` environment variable.
    The controller is connected to that engine before a single
    ``TelegramTelethonTransformer`` is registered on it.

    Returns:
        The configured ``ETLController`` instance.

    Raises:
        KeyError: If the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ["DB"])

    etl = ETLController()
    etl.connect_to_db(db_engine)
    etl.register_transformers([TelegramTelethonTransformer()])

    return etl
def scrape_channels(args):
logger.info(f"Scraping channels, media: {args.media}")
@@ -143,6 +156,12 @@ def archive_media(args):
controller = get_scraper_controller()
controller.archive_unarchived_media()
def transform(args):
    """Handle the ``transform`` command.

    Logs a start message, builds the transformer controller via
    ``get_transformer_controller()`` and calls its
    ``transform_all_untransformed()`` method.

    Args:
        args: Parsed command-line arguments. Not read here; accepted so the
            signature matches the other command handlers, which are all
            called with ``args``.
    """
    # Plain literal: the message has no placeholders, so the f-prefix was
    # redundant. The emitted text is unchanged.
    logger.info("Transforming untransformed media")

    controller = get_transformer_controller()
    controller.transform_all_untransformed()
def init_db():
engine = create_engine(os.environ["DB"])
@@ -179,5 +198,7 @@ if __name__ == "__main__":
archive_media(args)
elif args.command == "channel-info":
scrape_channel_info(args)
elif args.command == "transform":
transform(args)
else:
logger.error(f"Unrecognized command {args.command}")