mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
Remove Rumble scraper
This commit is contained in:
99
app.py
99
app.py
@@ -13,14 +13,15 @@ from cisticola.scraper import (
|
|||||||
VkontakteScraper,
|
VkontakteScraper,
|
||||||
TelegramTelethonScraper,
|
TelegramTelethonScraper,
|
||||||
GettrScraper,
|
GettrScraper,
|
||||||
RumbleScraper)
|
)
|
||||||
|
|
||||||
|
|
||||||
def sync_channels(args):
|
def sync_channels(args):
|
||||||
logger.info("Synchronizing channels")
|
logger.info("Synchronizing channels")
|
||||||
|
|
||||||
session = get_db_session()
|
session = get_db_session()
|
||||||
|
|
||||||
gc = gspread.service_account(filename='service_account.json')
|
gc = gspread.service_account(filename="service_account.json")
|
||||||
|
|
||||||
# Open a sheet from a spreadsheet in one go
|
# Open a sheet from a spreadsheet in one go
|
||||||
wks = gc.open_by_url(args.gsheet).worksheet("channels")
|
wks = gc.open_by_url(args.gsheet).worksheet("channels")
|
||||||
@@ -29,33 +30,44 @@ def sync_channels(args):
|
|||||||
|
|
||||||
for c in channels:
|
for c in channels:
|
||||||
# only adding channels, so skip everything with an ID
|
# only adding channels, so skip everything with an ID
|
||||||
if c['id'] == '':
|
if c["id"] == "":
|
||||||
del c['id']
|
del c["id"]
|
||||||
del c['followers']
|
del c["followers"]
|
||||||
|
|
||||||
|
if c["public"] == "":
|
||||||
|
c["public"] = False
|
||||||
|
if c["chat"] == "":
|
||||||
|
c["chat"] = False
|
||||||
|
|
||||||
if c['public'] == '': c['public'] = False
|
|
||||||
if c['chat'] == '': c['chat'] = False
|
|
||||||
|
|
||||||
for k in c.keys():
|
for k in c.keys():
|
||||||
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
|
if c[k] == "TRUE" or c[k] == "yes":
|
||||||
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
|
c[k] = True
|
||||||
|
if c[k] == "FALSE" or c[k] == "no":
|
||||||
|
c[k] = False
|
||||||
|
|
||||||
if c[k] == '': c[k] = None
|
if c[k] == "":
|
||||||
|
c[k] = None
|
||||||
|
|
||||||
# check to see if this already exists,
|
# check to see if this already exists,
|
||||||
platform_id = None
|
platform_id = None
|
||||||
if c['platform_id'] != '':
|
if c["platform_id"] != "":
|
||||||
platform_id = c['platform_id']
|
platform_id = c["platform_id"]
|
||||||
|
|
||||||
channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform=c['platform'], url=c['url']).first()
|
channel = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter_by(
|
||||||
|
platform_id=str(platform_id), platform=c["platform"], url=c["url"]
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
if not channel:
|
if not channel:
|
||||||
channel = Channel(**c, source='researcher')
|
channel = Channel(**c, source="researcher")
|
||||||
logger.debug(f"{channel} does not exist, adding")
|
logger.debug(f"{channel} does not exist, adding")
|
||||||
session.add(channel)
|
session.add(channel)
|
||||||
session.flush()
|
session.flush()
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
wks.update_cell(row, 1, channel.id)
|
wks.update_cell(row, 1, channel.id)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
@@ -63,37 +75,36 @@ def sync_channels(args):
|
|||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
|
|
||||||
def get_db_session():
|
def get_db_session():
|
||||||
engine = create_engine(os.environ['DB'])
|
engine = create_engine(os.environ["DB"])
|
||||||
|
|
||||||
session_generator = sessionmaker()
|
session_generator = sessionmaker()
|
||||||
session_generator.configure(bind=engine)
|
session_generator.configure(bind=engine)
|
||||||
session = session_generator()
|
session = session_generator()
|
||||||
|
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
def get_scraper_controller():
|
def get_scraper_controller():
|
||||||
engine = create_engine(os.environ['DB'])
|
engine = create_engine(os.environ["DB"])
|
||||||
|
|
||||||
controller = ScraperController()
|
controller = ScraperController()
|
||||||
controller.connect_to_db(engine)
|
controller.connect_to_db(engine)
|
||||||
|
|
||||||
scrapers = [
|
scrapers = [TelegramTelethonScraper(), VkontakteScraper(), GettrScraper()]
|
||||||
TelegramTelethonScraper(),
|
|
||||||
VkontakteScraper(),
|
|
||||||
GettrScraper(),
|
|
||||||
RumbleScraper()
|
|
||||||
]
|
|
||||||
|
|
||||||
controller.register_scrapers(scrapers)
|
controller.register_scrapers(scrapers)
|
||||||
|
|
||||||
return controller
|
return controller
|
||||||
|
|
||||||
|
|
||||||
def scrape_channels(args):
|
def scrape_channels(args):
|
||||||
logger.info(f"Scraping channels, media: {args.media}")
|
logger.info(f"Scraping channels, media: {args.media}")
|
||||||
|
|
||||||
controller = get_scraper_controller()
|
controller = get_scraper_controller()
|
||||||
controller.scrape_all_channels(archive_media = args.media)
|
controller.scrape_all_channels(archive_media=args.media)
|
||||||
|
|
||||||
|
|
||||||
def scrape_channel_info(args):
|
def scrape_channel_info(args):
|
||||||
logger.info(f"Scraping channel info")
|
logger.info(f"Scraping channel info")
|
||||||
@@ -101,38 +112,48 @@ def scrape_channel_info(args):
|
|||||||
controller = get_scraper_controller()
|
controller = get_scraper_controller()
|
||||||
controller.scrape_all_channel_info()
|
controller.scrape_all_channel_info()
|
||||||
|
|
||||||
|
|
||||||
def archive_media(args):
|
def archive_media(args):
|
||||||
logger.info(f"Archiving unarchived media")
|
logger.info(f"Archiving unarchived media")
|
||||||
|
|
||||||
controller = get_scraper_controller()
|
controller = get_scraper_controller()
|
||||||
controller.archive_unarchived_media()
|
controller.archive_unarchived_media()
|
||||||
|
|
||||||
|
|
||||||
def init_db():
|
def init_db():
|
||||||
engine = create_engine(os.environ['DB'])
|
engine = create_engine(os.environ["DB"])
|
||||||
mapper_registry.metadata.create_all(bind=engine)
|
mapper_registry.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
if __name__ == "__main__":
|
||||||
logger.remove()
|
logger.remove()
|
||||||
logger.add(sys.stdout, level="DEBUG", catch=True)
|
logger.add(sys.stdout, level="DEBUG", catch=True)
|
||||||
logger.add("./test.log", level="TRACE")
|
logger.add("./test.log", level="TRACE")
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
|
parser = argparse.ArgumentParser(description="Cisticola command line tools")
|
||||||
parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"')
|
parser.add_argument(
|
||||||
parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
|
"command",
|
||||||
parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media')
|
type=str,
|
||||||
|
help='Command to run: "sync-channels", "scrape-channels", or "archive-media"',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--media", action="store_true", help="[scrape-channels] Add this flag to media"
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.command == "init-db":
|
||||||
if args.command == 'init-db':
|
|
||||||
init_db()
|
init_db()
|
||||||
elif args.command == 'sync-channels':
|
elif args.command == "sync-channels":
|
||||||
sync_channels(args)
|
sync_channels(args)
|
||||||
elif args.command == 'scrape-channels':
|
elif args.command == "scrape-channels":
|
||||||
scrape_channels(args)
|
scrape_channels(args)
|
||||||
elif args.command == 'archive-media':
|
elif args.command == "archive-media":
|
||||||
archive_media(args)
|
archive_media(args)
|
||||||
elif args.command == 'channel-info':
|
elif args.command == "channel-info":
|
||||||
scrape_channel_info(args)
|
scrape_channel_info(args)
|
||||||
else:
|
else:
|
||||||
logger.error(f"Unrecognized command {args.command}")
|
logger.error(f"Unrecognized command {args.command}")
|
||||||
|
|||||||
Reference in New Issue
Block a user