mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
Merge branch 'main' of https://github.com/bellingcat/cisticola into transformers
This commit is contained in:
5
app.py
5
app.py
@@ -171,7 +171,6 @@ def init_db():
|
||||
if __name__ == "__main__":
|
||||
logger.remove()
|
||||
logger.add(sys.stdout, level="DEBUG", catch=True)
|
||||
logger.add("logs/cisticola.log", level="TRACE", rotation="100 MB")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Cisticola command line tools")
|
||||
parser.add_argument(
|
||||
@@ -191,12 +190,16 @@ if __name__ == "__main__":
|
||||
if args.command == "init-db":
|
||||
init_db()
|
||||
elif args.command == "sync-channels":
|
||||
logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB")
|
||||
sync_channels(args)
|
||||
elif args.command == "scrape-channels":
|
||||
logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB")
|
||||
scrape_channels(args)
|
||||
elif args.command == "archive-media":
|
||||
logger.add("logs/archive-media.log", level="TRACE", rotation="100 MB")
|
||||
archive_media(args)
|
||||
elif args.command == "channel-info":
|
||||
logger.add("logs/channel-info.log", level="TRACE", rotation="100 MB")
|
||||
scrape_channel_info(args)
|
||||
elif args.command == "transform":
|
||||
transform(args)
|
||||
|
||||
@@ -12,8 +12,9 @@ from sqlalchemy.orm import sessionmaker
|
||||
import yt_dlp
|
||||
from sqlalchemy.sql.expression import func
|
||||
from pathlib import Path
|
||||
from sqlalchemy import nullsfirst
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
@@ -340,6 +341,8 @@ class ScraperController:
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
|
||||
session.close()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media)
|
||||
|
||||
def scrape_all_channel_info(self):
|
||||
@@ -349,8 +352,15 @@ class ScraperController:
|
||||
|
||||
session = self.session()
|
||||
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
# Because of rate limiting, we may not be able to successfully scrape info for all of these channels.
|
||||
# This will sort the channels by the least recently scraped.
|
||||
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
|
||||
channels = session.query(Channel).\
|
||||
where(Channel.source=='researcher').\
|
||||
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
|
||||
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
|
||||
|
||||
session.close()
|
||||
return self.scrape_channel_info(channels)
|
||||
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
@@ -419,6 +429,8 @@ class ScraperController:
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def archive_unarchived_media(self):
|
||||
if self.session is None:
|
||||
@@ -437,7 +449,8 @@ class ScraperController:
|
||||
handled = False
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.__version__ == post.scraper:
|
||||
# compare major versions
|
||||
if scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
|
||||
handled = True
|
||||
logger.debug(f"{scraper} is archiving media for ID {post.id}")
|
||||
post = scraper.archive_files(post)
|
||||
@@ -452,6 +465,7 @@ class ScraperController:
|
||||
logger.warning(f"No handler found for post scraped with {post.scraper}")
|
||||
|
||||
session.commit()
|
||||
session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channel_info(self, channels: List[Channel]):
|
||||
@@ -470,6 +484,7 @@ class ScraperController:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
for channel in channels:
|
||||
handled = False
|
||||
|
||||
@@ -479,10 +494,13 @@ class ScraperController:
|
||||
handled = True
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
|
||||
try:
|
||||
info = scraper.get_profile(channel)
|
||||
if info is None:
|
||||
logger.warning(f"No info returned for {channel}")
|
||||
break
|
||||
|
||||
session.add(info)
|
||||
|
||||
session.commit()
|
||||
@@ -495,6 +513,8 @@ class ScraperController:
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
session.close()
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
@@ -18,7 +18,7 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
|
||||
class TelegramTelethonScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||
__version__ = "TelegramTelethonScraper 0.0.2"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://t.me/')[1]
|
||||
@@ -62,6 +62,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
else:
|
||||
logger.warning("Downloaded blob was None")
|
||||
result.archived_urls = {}
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
|
||||
return result
|
||||
@@ -80,6 +81,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
return self.archive_post_media(post, client=client)
|
||||
|
||||
if type(post.media) == types.MessageMediaDocument:
|
||||
if post.media.document.size/(1024*1024) > 50:
|
||||
logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
||||
return None, None
|
||||
|
||||
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
||||
else:
|
||||
logger.debug(f"Archiving {type(post.media)}")
|
||||
|
||||
Reference in New Issue
Block a user