From 2097e42df0ca8658fd0d08503130396d5240f09a Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 25 Aug 2021 11:04:14 +0000 Subject: [PATCH] Dynamically adjust number of keyframes for contact sheet view. --- Pipfile.lock | 89 +++++++++++++++++++++++----------------------- auto_archive.py | 94 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 108 insertions(+), 75 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 698739d..ef838d8 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -35,18 +35,18 @@ }, "boto3": { "hashes": [ - "sha256:1d24c6d1f5db4b52bb29f1dfe13fd3e9d95d9fa4634b0638a096f5a884173cde", - "sha256:8ee8766813864796be6c87ad762c6da4bfef603977931854a38f49fe4db06495" + "sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11", + "sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e" ], "index": "pypi", - "version": "==1.17.84" + "version": "==1.18.27" }, "botocore": { "hashes": [ - "sha256:75e1397b80aa8757a26636b949eebd20b3cf67e8f1ed80dc01170907e06ea45d", - "sha256:bc59eb748fcb07835613ebea6dcc2600ae1a8be0fae30e40b9c1e81b73262296" + "sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5", + "sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5" ], - "version": "==1.20.84" + "version": "==1.21.27" }, "cachetools": { "hashes": [ @@ -62,12 +62,13 @@ ], "version": "==2021.5.30" }, - "chardet": { + "charset-normalizer": { "hashes": [ - "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa", - "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5" + "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", + "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3" ], - "version": "==4.0.0" + "markers": "python_version >= '3'", + "version": "==2.0.4" }, "ffmpeg-python": { "hashes": [ @@ -85,32 +86,33 @@ }, "google-auth": { "hashes": [ - 
"sha256:044d81b1e58012f8ebc71cc134e191c1fa312f543f1fbc99973afe28c25e3228", - "sha256:b3ca7a8ff9ab3bdefee3ad5aefb11fc6485423767eee016f5942d8e606ca23fb" + "sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c", + "sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3" ], - "version": "==1.30.1" + "version": "==2.0.1" }, "google-auth-oauthlib": { "hashes": [ - "sha256:09832c6e75032f93818edf1affe4746121d640c625a5bef9b5c96af676e98eee", - "sha256:0e92aacacfb94978de3b7972cf4b0f204c3cd206f74ddd0dc0b31e91164e6317" + "sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba", + "sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67" ], - "version": "==0.4.4" + "version": "==0.4.5" }, "gspread": { "hashes": [ - "sha256:056ceb9fb4f439c15ec39d84c91653c6435f775a1c8afc8fe7f909f8393821fb", - "sha256:4bda4ab8c5edb9e41cf4ae40d4d5fb30447522b4e43608e05c01351ab1b96912" + "sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be", + "sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48" ], "index": "pypi", - "version": "==3.7.0" + "version": "==4.0.1" }, "idna": { "hashes": [ - "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", - "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" + "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a", + "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3" ], - "version": "==2.10" + "markers": "python_version >= '3'", + "version": "==3.2" }, "jmespath": { "hashes": [ @@ -121,10 +123,10 @@ }, "oauthlib": { "hashes": [ - "sha256:bee41cc35fcca6e988463cacc3bcb8a96224f470ca547e697b604cc697b2f889", - "sha256:df884cd6cbe20e32633f1db1072e9356f53638e4361bef4e8b03c9127c9328ea" + "sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc", + "sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3" ], - "version": "==3.1.0" + "version": "==3.1.1" 
}, "pyasn1": { "hashes": [ @@ -164,25 +166,25 @@ }, "python-dateutil": { "hashes": [ - "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", - "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "version": "==2.8.1" + "version": "==2.8.2" }, "python-dotenv": { "hashes": [ - "sha256:00aa34e92d992e9f8383730816359647f358f4a3be1ba45e5a5cefd27ee91544", - "sha256:b1ae5e9643d5ed987fc57cc2583021e38db531946518130777734f9589b3141f" + "sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1", + "sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172" ], "index": "pypi", - "version": "==0.17.1" + "version": "==0.19.0" }, "requests": { "hashes": [ - "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", - "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" + "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", + "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7" ], - "version": "==2.25.1" + "version": "==2.26.0" }, "requests-oauthlib": { "hashes": [ @@ -197,15 +199,14 @@ "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2", "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9" ], - "markers": "python_version >= '3.6'", "version": "==4.7.2" }, "s3transfer": { "hashes": [ - "sha256:9b3752887a2880690ce628bc263d6d13a3864083aeacff4890c1c9839a5eb0bc", - "sha256:cb022f4b16551edebbb31a377d3f09600dbada7363d8c5db7976e7f47732e1b2" + "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c", + "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803" ], - "version": "==0.4.2" + "version": "==0.5.0" }, "six": { "hashes": [ @@ -224,18 +225,18 @@ }, "urllib3": { "hashes": [ - 
"sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c", - "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098" + "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", + "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" ], - "version": "==1.26.5" + "version": "==1.26.6" }, "youtube-dl": { "hashes": [ - "sha256:4e569cb0477428fd96ee6f7e7a6640b7c9416be626ed708ac4b8ada6c5a6ffbe", - "sha256:deb489a17e541ec7ac35581375ae94161eb22a7ec3373b1216181a4360c187ab" + "sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac", + "sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab" ], "index": "pypi", - "version": "==2021.5.16" + "version": "==2021.6.6" } }, "develop": {} diff --git a/auto_archive.py b/auto_archive.py index 9eb95cf..47b3135 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -47,32 +47,45 @@ def index_to_col(index): return alphabet[index] -def get_thumbnails(filename, s3_client): +def get_thumbnails(filename, s3_client, duration = None): if not os.path.exists(filename.split('.')[0]): os.mkdir(filename.split('.')[0]) + fps = 0.5 + if duration is not None: + duration = float(duration) + + if duration < 60: + fps = 10.0 / duration + elif duration < 120: + fps = 20.0 / duration + else: + fps = 40.0 / duration + + stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=0.5).filter('scale', 512, -1) + stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) stream.output(filename.split('.')[0] + '/out%d.jpg').run() thumbnails = os.listdir(filename.split('.')[0] + '/') cdn_urls = [] for fname in thumbnails: - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname + if fname[-3:] == 'jpg': + thumbnail_filename = filename.split('.')[0] + '/' + fname + key = filename.split('/')[1].split('.')[0] + '/' + fname - cdn_url = 
'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) + cdn_url = 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( + os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - with open(thumbnail_filename, 'rb') as f: - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) + with open(thumbnail_filename, 'rb') as f: + s3_client.upload_fileobj(f, Bucket=os.getenv( + 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) + cdn_urls.append(cdn_url) + os.remove(thumbnail_filename) - key_thumb = cdn_urls[int(len(cdn_urls)*0.25)] + key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] index_page = f'''{filename} ''' @@ -117,7 +130,6 @@ def download_telegram_video(url, s3_client, check_if_exists=False): video_url = video.get('src') key = video_url.split('/')[-1].split('?')[0] filename = 'tmp/' + key - print(video_url, key) if check_if_exists: try: @@ -145,14 +157,20 @@ def download_telegram_video(url, s3_client, check_if_exists=False): s3_client.upload_fileobj(f, Bucket=os.getenv( 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - key_thumb, thumb_index = get_thumbnails(filename, s3_client) + duration = s.find_all('time')[0].contents[0] + if ':' in duration: + duration = float(duration.split(':')[0])*60 + float(duration.split(':')[1]) + else: + duration = float(duration) + + key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration) os.remove(filename) video_data = { 'cdn_url': cdn_url, 'thumbnail': key_thumb, 'thumbnail_index': thumb_index, - 'duration': s.find_all('time')[0].contents[0], + 'duration': duration, 'title': original_url, 'timestamp': s.find_all('time')[1].get('datetime') } @@ -183,11 +201,14 @@ def internet_archive(url, s3_client): while status_r.json()['status'] == 'pending' and retries < 40: time.sleep(5) - status_r = requests.get( - 
'https://web.archive.org/save/status/' + job_id, headers={ - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - }) + try: + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers={ + "Accept": "application/json", + "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') + }) + except: + time.sleep(1) retries += 1 @@ -211,7 +232,8 @@ def internet_archive(url, s3_client): def download_vid(url, s3_client, check_if_exists=False): ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if url[0:20] == 'https://facebook.com' and os.getenv('FB_COOKIE'): + if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'): + print('Using cookie') youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') ydl = youtube_dl.YoutubeDL(ydl_opts) cdn_url = None @@ -250,10 +272,10 @@ def download_vid(url, s3_client, check_if_exists=False): if len(info['entries']) > 1: raise Exception( 'ERROR: Cannot archive channels or pages with multiple videos') + else: + info = info['entries'][0] - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) + filename = ydl.prepare_filename(info) if not os.path.exists(filename): filename = filename.split('.')[0] + '.mkv' @@ -267,14 +289,15 @@ def download_vid(url, s3_client, check_if_exists=False): s3_client.upload_fileobj(f, Bucket=os.getenv( 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - key_thumb, thumb_index = get_thumbnails(filename, s3_client) + duration = info['duration'] if 'duration' in info else None + key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration) os.remove(filename) video_data = { 'cdn_url': cdn_url, 'thumbnail': key_thumb, 'thumbnail_index': thumb_index, - 'duration': info['duration'] if 'duration' in info else 
None, + 'duration': duration, 'title': info['title'] if 'title' in info else None, 'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None, } @@ -404,17 +427,29 @@ def process_sheet(sheet): # check so we don't step on each others' toes if latest_val == '' or latest_val is None: - if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]: - wks.update( + wks.update( columns['status'] + str(i), 'Archive in progress') + if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]: video_data, status = download_telegram_video( v[url_index], s3_client, check_if_exists=True) + + if status == 'No telegram video found': + print("Trying Internet Archive fallback") + + video_data, status = internet_archive( + v[url_index], s3_client) + update_sheet(wks, i, status, video_data, columns, v) + else: try: ydl_opts = { 'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} + if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'): + print('Using cookie') + youtube_dl.utils.std_headers['cookie'] = os.getenv( + 'FB_COOKIE') ydl = youtube_dl.YoutubeDL(ydl_opts) info = ydl.extract_info( v[url_index], download=False) @@ -434,9 +469,6 @@ def process_sheet(sheet): except: # i'm sure there's a better way to handle this than nested try/catch blocks try: - wks.update( - columns['status'] + str(i), 'Archive in progress') - print("Trying Internet Archive fallback") video_data, status = internet_archive(