This commit is contained in:
Logan Williams
2022-06-09 14:11:52 +00:00

View File

@@ -18,7 +18,23 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper): class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library""" """An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.2" __version__ = "TelegramTelethonScraper 0.0.3"
client = None
def __init__(self):
super().__init__()
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
# set up a persistent client for Telethon
self.client = TelegramClient(phone, api_id, api_hash)
self.client.connect()
def __del__(self):
logger.info("Disconnecting Telethon client")
self.client.disconnect()
def get_username_from_url(url): def get_username_from_url(url):
username = url.split('https://t.me/')[1] username = url.split('https://t.me/')[1]
@@ -39,18 +55,10 @@ class TelegramTelethonScraper(Scraper):
return identifier return identifier
@logger.catch @logger.catch
def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult: def archive_files(self, result: ScraperResult) -> ScraperResult:
if len(result.archived_urls.keys()) == 0: if len(result.archived_urls.keys()) == 0:
return result return result
if client is None:
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
return self.archive_files(result, client)
if len(list(result.archived_urls.keys())) != 1: if len(list(result.archived_urls.keys())) != 1:
logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}") logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
else: else:
@@ -59,11 +67,11 @@ class TelegramTelethonScraper(Scraper):
if result.archived_urls[key] is None: if result.archived_urls[key] is None:
raw = json.loads(result.raw_data) raw = json.loads(result.raw_data)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
blob = None blob = None
if len(message) > 0: if len(message) > 0:
blob, output_file_with_ext = self.archive_post_media(message[0], client) blob, output_file_with_ext = self.archive_post_media(message[0])
else: else:
logger.warning("No message retrieved") logger.warning("No message retrieved")
@@ -83,19 +91,11 @@ class TelegramTelethonScraper(Scraper):
return result return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None): def archive_post_media(self, post : types.Message):
if post.media is None: if post.media is None:
logger.debug("No media for post") logger.debug("No media for post")
return None, None return None, None
if client is None:
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
return self.archive_post_media(post, client=client)
if type(post.media) == types.MessageMediaDocument: if type(post.media) == types.MessageMediaDocument:
if post.media.document.size/(1024*1024) > 50: if post.media.document.size/(1024*1024) > 50:
logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
@@ -110,7 +110,7 @@ class TelegramTelethonScraper(Scraper):
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
output_file = Path(temp_dir, key) output_file = Path(temp_dir, key)
client.download_media(post.media, output_file) self.client.download_media(post.media, output_file)
if len(os.listdir(temp_dir)) == 0: if len(os.listdir(temp_dir)) == 0:
logger.warning(f"No file present. Could not archive {post.media}") logger.warning(f"No file present. Could not archive {post.media}")
@@ -131,42 +131,37 @@ class TelegramTelethonScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = TelegramTelethonScraper.get_channel_identifier(channel) username = TelegramTelethonScraper.get_channel_identifier(channel)
api_id = os.environ['TELEGRAM_API_ID'] for post in self.client.iter_messages(username):
api_hash = os.environ['TELEGRAM_API_HASH'] post_url = f'{channel.url}/{post.id}'
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client: logger.trace(f"Archiving post {post_url} from {post.date}")
for post in client.iter_messages(username):
post_url = f'{channel.url}/{post.id}'
logger.trace(f"Archiving post {post_url} from {post.date}") if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
break
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): archived_urls = {}
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
break
archived_urls = {} if post.media is not None:
archived_urls[post_url] = None
if post.media is not None: # if archive_media:
archived_urls[post_url] = None # blob, output_file_with_ext = self.archive_post_media(post, client)
# if blob is not None:
# # TODO specify Content-Type
# archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
# archived_urls[post_url] = archived_url
# if archive_media: yield ScraperResult(
# blob, output_file_with_ext = self.archive_post_media(post, client) scraper=self.__version__,
# if blob is not None: platform="Telegram",
# # TODO specify Content-Type channel=channel.id,
# archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) platform_id=post_url,
# archived_urls[post_url] = archived_url date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
yield ScraperResult( raw_data=json.dumps(post.to_dict(), default=str),
scraper=self.__version__, archived_urls=archived_urls,
platform="Telegram", media_archived=datetime.now(timezone.utc) if archive_media else None)
channel=channel.id,
platform_id=post_url,
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
@logger.catch @logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo: def get_profile(self, channel: Channel) -> RawChannelInfo: