From 2601313249fd46581150b05692f908df5c709c07 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:13:09 +0100 Subject: [PATCH] removed archivers.py --- archivers.py | 412 --------------------------------------------------- 1 file changed, 412 deletions(-) delete mode 100644 archivers.py diff --git a/archivers.py b/archivers.py deleted file mode 100644 index 7c8df8c..0000000 --- a/archivers.py +++ /dev/null @@ -1,412 +0,0 @@ -from dataclasses import dataclass -import youtube_dl -from bs4 import BeautifulSoup -import requests -import tiktok_downloader -from loguru import logger -import os -import datetime -import ffmpeg -from botocore.errorfactory import ClientError -import time -import traceback - -# TODO There should be a better way of generating keys, that adds the following info: -# - name of sheet that it is being archived from -# (this means we might archive the same media twice on different sheets, but that's OK I think) -# - name of archiver/platform that the video comes from -# This should make it easier to maintain and clean the archive later - -# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be -# cleaned up? Difficult is we don't know the filename until the archivers start working. - - -def get_cdn_url(key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - - -def do_s3_upload(s3_client, f, key): - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - - -def get_key(filename): - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key - - -def get_thumbnails(filename, s3_client, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() - - thumbnails = os.listdir(filename.split('.')[0] + '/') - cdn_urls = [] - - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname - - cdn_url = get_cdn_url(key) - - with open(thumbnail_filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) - - if len(cdn_urls) == 0: - return ('None', 'None') - - key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - - s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) - - thumb_index_cdn_url = get_cdn_url(thumb_index) - - return (key_thumb, thumb_index_cdn_url) - - -@dataclass -class ArchiveResult: - status: str - cdn_url: str = None - thumbnail: str = None - thumbnail_index: str = None - duration: float = None - title: str = None - timestamp: datetime.datetime = None - - -class Archiver: - def __init__(self, s3_client): - self.s3 = s3_client - - def download(self, url): - pass - - -class TelegramArchiver(Archiver): - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} - status = "success" - - original_url = url - - if url[-8:] != "?embed=1": - url += "?embed=1" - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - return False # could not find video - - video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # extract duration from HTML - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0])*60 + float(duration.split(':')[1]) - else: - duration = float(duration) - - # process thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) - - -class YoutubeDLArchiver(Archiver): - def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): - logger.info('Using Facebook cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - - ydl = youtube_dl.YoutubeDL(ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except youtube_dl.utils.DownloadError: - # no video here - return False - - if 'is_live' in info and info['is_live']: - logger.warning("Live streaming media, not archiving now") - return ArchiveResult(status="Streaming media") - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') - return False - elif len(info['entries']) == 0: - logger.warning( - 'YoutubeDLArchiver succeeded but did not find video') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = get_key(filename) - - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = get_key(filename) - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # get duration - duration = info['duration'] if 'duration' in info else None - - # get thumbnails - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - except: - key_thumb = '' - thumb_index = 'Could not generate thumbnails' - - os.remove(filename) - - timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime( - info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, - timestamp=timestamp) - - -class WaybackArchiver(Archiver): - def __init__(self, s3_client): - self.s3 = s3_client - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if check_if_exists and url in self.seen_urls: - return self.seen_urls[url] - - ia_headers = { - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - if 'job_id' not in r.json() and 'message' in r.json(): - return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") - - job_id = r.json()['job_id'] - - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - - retries = 0 - - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - except: - time.sleep(1) - - retries += 1 - - if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - status_json = status_r.json() - - if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) - - archive_url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] - - try: - r = requests.get(archive_url) - - parsed = BeautifulSoup( - r.content, 'html.parser') - - title = parsed.find_all('title')[ - 0].text - - if title == 'Wayback Machine': - title = 'Could not get title' - except: - title = "Could not get title" - - result = ArchiveResult( - status='Internet Archive fallback', cdn_url=archive_url, title=title) - self.seen_urls[url] = result - return result - - -class TiktokArchiver(Archiver): - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = 'tiktok_' + str(info.id) + '.mp4' - cdn_url = get_cdn_url(key) - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - media = tiktok_downloader.snaptik(url).get_media() - - if len(media) <= 0: - if status == 'already archived': - return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) - else: - return ArchiveResult(status='Could not download media') - - media[0].download(filename) - - if status != 'already archived': - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=info.duration) - except: - key_thumb = '' - thumb_index = 'error creating thumbnails' - - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) - - except tiktok_downloader.Except.InvalidUrl: - status = 'Invalid URL' - return ArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - return ArchiveResult(status=status)