mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Transformer for Telegram, base transformer NLP hydration; no media
This commit is contained in:
21
app.py
21
app.py
@@ -16,6 +16,7 @@ from cisticola.scraper import (
|
||||
BitchuteScraper,
|
||||
RumbleScraper,
|
||||
)
|
||||
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
|
||||
|
||||
|
||||
def sync_channels(args):
|
||||
@@ -122,6 +123,18 @@ def get_scraper_controller():
|
||||
|
||||
return controller
|
||||
|
||||
def get_transformer_controller():
    """Build an ETLController wired to the database and its transformers.

    The database URL is read from the ``DB`` environment variable; a missing
    variable raises ``KeyError`` from the ``os.environ`` lookup.

    Returns:
        The ETLController, connected to the engine and with a
        TelegramTelethonTransformer registered.
    """
    # Create the engine first so a missing ``DB`` variable fails before any
    # controller object is constructed.
    db_engine = create_engine(os.environ["DB"])

    etl = ETLController()
    etl.connect_to_db(db_engine)
    etl.register_transformers([TelegramTelethonTransformer()])

    return etl
|
||||
|
||||
|
||||
def scrape_channels(args):
|
||||
logger.info(f"Scraping channels, media: {args.media}")
|
||||
@@ -143,6 +156,12 @@ def archive_media(args):
|
||||
controller = get_scraper_controller()
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
def transform(args):
    """Run every registered transformer over the untransformed data.

    Args:
        args: Parsed command-line arguments. Not read by this function; it is
            accepted because the command dispatch calls each handler with
            ``args``.
    """
    # The message has no placeholders, so a plain string literal is used
    # instead of an f-string; the logged text is unchanged.
    logger.info("Transforming untransformed media")

    controller = get_transformer_controller()
    controller.transform_all_untransformed()
|
||||
|
||||
|
||||
def init_db():
|
||||
engine = create_engine(os.environ["DB"])
|
||||
@@ -179,5 +198,7 @@ if __name__ == "__main__":
|
||||
archive_media(args)
|
||||
elif args.command == "channel-info":
|
||||
scrape_channel_info(args)
|
||||
elif args.command == "transform":
|
||||
transform(args)
|
||||
else:
|
||||
logger.error(f"Unrecognized command {args.command}")
|
||||
|
||||
Reference in New Issue
Block a user