mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Dynamically adjust number of keyframes for contact sheet view.
This commit is contained in:
89
Pipfile.lock
generated
89
Pipfile.lock
generated
@@ -35,18 +35,18 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:1d24c6d1f5db4b52bb29f1dfe13fd3e9d95d9fa4634b0638a096f5a884173cde",
|
||||
"sha256:8ee8766813864796be6c87ad762c6da4bfef603977931854a38f49fe4db06495"
|
||||
"sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11",
|
||||
"sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.17.84"
|
||||
"version": "==1.18.27"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:75e1397b80aa8757a26636b949eebd20b3cf67e8f1ed80dc01170907e06ea45d",
|
||||
"sha256:bc59eb748fcb07835613ebea6dcc2600ae1a8be0fae30e40b9c1e81b73262296"
|
||||
"sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5",
|
||||
"sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5"
|
||||
],
|
||||
"version": "==1.20.84"
|
||||
"version": "==1.21.27"
|
||||
},
|
||||
"cachetools": {
|
||||
"hashes": [
|
||||
@@ -62,12 +62,13 @@
|
||||
],
|
||||
"version": "==2021.5.30"
|
||||
},
|
||||
"chardet": {
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
|
||||
"sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
|
||||
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
|
||||
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
|
||||
],
|
||||
"version": "==4.0.0"
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.4"
|
||||
},
|
||||
"ffmpeg-python": {
|
||||
"hashes": [
|
||||
@@ -85,32 +86,33 @@
|
||||
},
|
||||
"google-auth": {
|
||||
"hashes": [
|
||||
"sha256:044d81b1e58012f8ebc71cc134e191c1fa312f543f1fbc99973afe28c25e3228",
|
||||
"sha256:b3ca7a8ff9ab3bdefee3ad5aefb11fc6485423767eee016f5942d8e606ca23fb"
|
||||
"sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c",
|
||||
"sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3"
|
||||
],
|
||||
"version": "==1.30.1"
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"google-auth-oauthlib": {
|
||||
"hashes": [
|
||||
"sha256:09832c6e75032f93818edf1affe4746121d640c625a5bef9b5c96af676e98eee",
|
||||
"sha256:0e92aacacfb94978de3b7972cf4b0f204c3cd206f74ddd0dc0b31e91164e6317"
|
||||
"sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba",
|
||||
"sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67"
|
||||
],
|
||||
"version": "==0.4.4"
|
||||
"version": "==0.4.5"
|
||||
},
|
||||
"gspread": {
|
||||
"hashes": [
|
||||
"sha256:056ceb9fb4f439c15ec39d84c91653c6435f775a1c8afc8fe7f909f8393821fb",
|
||||
"sha256:4bda4ab8c5edb9e41cf4ae40d4d5fb30447522b4e43608e05c01351ab1b96912"
|
||||
"sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be",
|
||||
"sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.7.0"
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
|
||||
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
|
||||
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
|
||||
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
|
||||
],
|
||||
"version": "==2.10"
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.2"
|
||||
},
|
||||
"jmespath": {
|
||||
"hashes": [
|
||||
@@ -121,10 +123,10 @@
|
||||
},
|
||||
"oauthlib": {
|
||||
"hashes": [
|
||||
"sha256:bee41cc35fcca6e988463cacc3bcb8a96224f470ca547e697b604cc697b2f889",
|
||||
"sha256:df884cd6cbe20e32633f1db1072e9356f53638e4361bef4e8b03c9127c9328ea"
|
||||
"sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc",
|
||||
"sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3"
|
||||
],
|
||||
"version": "==3.1.0"
|
||||
"version": "==3.1.1"
|
||||
},
|
||||
"pyasn1": {
|
||||
"hashes": [
|
||||
@@ -164,25 +166,25 @@
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
|
||||
"sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
|
||||
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
|
||||
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
|
||||
],
|
||||
"version": "==2.8.1"
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"python-dotenv": {
|
||||
"hashes": [
|
||||
"sha256:00aa34e92d992e9f8383730816359647f358f4a3be1ba45e5a5cefd27ee91544",
|
||||
"sha256:b1ae5e9643d5ed987fc57cc2583021e38db531946518130777734f9589b3141f"
|
||||
"sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1",
|
||||
"sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.17.1"
|
||||
"version": "==0.19.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
|
||||
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"version": "==2.25.1"
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"requests-oauthlib": {
|
||||
"hashes": [
|
||||
@@ -197,15 +199,14 @@
|
||||
"sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2",
|
||||
"sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==4.7.2"
|
||||
},
|
||||
"s3transfer": {
|
||||
"hashes": [
|
||||
"sha256:9b3752887a2880690ce628bc263d6d13a3864083aeacff4890c1c9839a5eb0bc",
|
||||
"sha256:cb022f4b16551edebbb31a377d3f09600dbada7363d8c5db7976e7f47732e1b2"
|
||||
"sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c",
|
||||
"sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"
|
||||
],
|
||||
"version": "==0.4.2"
|
||||
"version": "==0.5.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
@@ -224,18 +225,18 @@
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c",
|
||||
"sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"
|
||||
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
|
||||
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
|
||||
],
|
||||
"version": "==1.26.5"
|
||||
"version": "==1.26.6"
|
||||
},
|
||||
"youtube-dl": {
|
||||
"hashes": [
|
||||
"sha256:4e569cb0477428fd96ee6f7e7a6640b7c9416be626ed708ac4b8ada6c5a6ffbe",
|
||||
"sha256:deb489a17e541ec7ac35581375ae94161eb22a7ec3373b1216181a4360c187ab"
|
||||
"sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac",
|
||||
"sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.5.16"
|
||||
"version": "==2021.6.6"
|
||||
}
|
||||
},
|
||||
"develop": {}
|
||||
|
||||
@@ -47,32 +47,45 @@ def index_to_col(index):
|
||||
return alphabet[index]
|
||||
|
||||
|
||||
def get_thumbnails(filename, s3_client):
|
||||
def get_thumbnails(filename, s3_client, duration = None):
|
||||
if not os.path.exists(filename.split('.')[0]):
|
||||
os.mkdir(filename.split('.')[0])
|
||||
|
||||
fps = 0.5
|
||||
if duration is not None:
|
||||
duration = float(duration)
|
||||
|
||||
if duration < 60:
|
||||
fps = 10.0 / duration
|
||||
elif duration < 120:
|
||||
fps = 20.0 / duration
|
||||
else:
|
||||
fps = 40.0 / duration
|
||||
|
||||
|
||||
stream = ffmpeg.input(filename)
|
||||
stream = ffmpeg.filter(stream, 'fps', fps=0.5).filter('scale', 512, -1)
|
||||
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
|
||||
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
|
||||
|
||||
thumbnails = os.listdir(filename.split('.')[0] + '/')
|
||||
cdn_urls = []
|
||||
|
||||
for fname in thumbnails:
|
||||
thumbnail_filename = filename.split('.')[0] + '/' + fname
|
||||
key = filename.split('/')[1].split('.')[0] + '/' + fname
|
||||
if fname[-3:] == 'jpg':
|
||||
thumbnail_filename = filename.split('.')[0] + '/' + fname
|
||||
key = filename.split('/')[1].split('.')[0] + '/' + fname
|
||||
|
||||
cdn_url = 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
|
||||
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
|
||||
cdn_url = 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
|
||||
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
|
||||
|
||||
with open(thumbnail_filename, 'rb') as f:
|
||||
s3_client.upload_fileobj(f, Bucket=os.getenv(
|
||||
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
|
||||
with open(thumbnail_filename, 'rb') as f:
|
||||
s3_client.upload_fileobj(f, Bucket=os.getenv(
|
||||
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
|
||||
|
||||
cdn_urls.append(cdn_url)
|
||||
os.remove(thumbnail_filename)
|
||||
cdn_urls.append(cdn_url)
|
||||
os.remove(thumbnail_filename)
|
||||
|
||||
key_thumb = cdn_urls[int(len(cdn_urls)*0.25)]
|
||||
key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
|
||||
|
||||
index_page = f'''<html><head><title>{filename}</title></head>
|
||||
<body>'''
|
||||
@@ -117,7 +130,6 @@ def download_telegram_video(url, s3_client, check_if_exists=False):
|
||||
video_url = video.get('src')
|
||||
key = video_url.split('/')[-1].split('?')[0]
|
||||
filename = 'tmp/' + key
|
||||
print(video_url, key)
|
||||
|
||||
if check_if_exists:
|
||||
try:
|
||||
@@ -145,14 +157,20 @@ def download_telegram_video(url, s3_client, check_if_exists=False):
|
||||
s3_client.upload_fileobj(f, Bucket=os.getenv(
|
||||
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
|
||||
|
||||
key_thumb, thumb_index = get_thumbnails(filename, s3_client)
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(':')[0])*60 + float(duration.split(':')[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
|
||||
key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
|
||||
os.remove(filename)
|
||||
|
||||
video_data = {
|
||||
'cdn_url': cdn_url,
|
||||
'thumbnail': key_thumb,
|
||||
'thumbnail_index': thumb_index,
|
||||
'duration': s.find_all('time')[0].contents[0],
|
||||
'duration': duration,
|
||||
'title': original_url,
|
||||
'timestamp': s.find_all('time')[1].get('datetime')
|
||||
}
|
||||
@@ -183,11 +201,14 @@ def internet_archive(url, s3_client):
|
||||
while status_r.json()['status'] == 'pending' and retries < 40:
|
||||
time.sleep(5)
|
||||
|
||||
status_r = requests.get(
|
||||
'https://web.archive.org/save/status/' + job_id, headers={
|
||||
"Accept": "application/json",
|
||||
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
|
||||
})
|
||||
try:
|
||||
status_r = requests.get(
|
||||
'https://web.archive.org/save/status/' + job_id, headers={
|
||||
"Accept": "application/json",
|
||||
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
|
||||
})
|
||||
except:
|
||||
time.sleep(1)
|
||||
|
||||
retries += 1
|
||||
|
||||
@@ -211,7 +232,8 @@ def internet_archive(url, s3_client):
|
||||
|
||||
def download_vid(url, s3_client, check_if_exists=False):
|
||||
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||
if url[0:20] == 'https://facebook.com' and os.getenv('FB_COOKIE'):
|
||||
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
|
||||
print('Using cookie')
|
||||
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
cdn_url = None
|
||||
@@ -250,10 +272,10 @@ def download_vid(url, s3_client, check_if_exists=False):
|
||||
if len(info['entries']) > 1:
|
||||
raise Exception(
|
||||
'ERROR: Cannot archive channels or pages with multiple videos')
|
||||
else:
|
||||
info = info['entries'][0]
|
||||
|
||||
filename = ydl.prepare_filename(info['entries'][0])
|
||||
else:
|
||||
filename = ydl.prepare_filename(info)
|
||||
filename = ydl.prepare_filename(info)
|
||||
|
||||
if not os.path.exists(filename):
|
||||
filename = filename.split('.')[0] + '.mkv'
|
||||
@@ -267,14 +289,15 @@ def download_vid(url, s3_client, check_if_exists=False):
|
||||
s3_client.upload_fileobj(f, Bucket=os.getenv(
|
||||
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
|
||||
|
||||
key_thumb, thumb_index = get_thumbnails(filename, s3_client)
|
||||
duration = info['duration'] if 'duration' in info else None
|
||||
key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
|
||||
os.remove(filename)
|
||||
|
||||
video_data = {
|
||||
'cdn_url': cdn_url,
|
||||
'thumbnail': key_thumb,
|
||||
'thumbnail_index': thumb_index,
|
||||
'duration': info['duration'] if 'duration' in info else None,
|
||||
'duration': duration,
|
||||
'title': info['title'] if 'title' in info else None,
|
||||
'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None,
|
||||
}
|
||||
@@ -404,17 +427,29 @@ def process_sheet(sheet):
|
||||
|
||||
# check so we don't step on each others' toes
|
||||
if latest_val == '' or latest_val is None:
|
||||
if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]:
|
||||
wks.update(
|
||||
wks.update(
|
||||
columns['status'] + str(i), 'Archive in progress')
|
||||
|
||||
if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]:
|
||||
video_data, status = download_telegram_video(
|
||||
v[url_index], s3_client, check_if_exists=True)
|
||||
|
||||
if status == 'No telegram video found':
|
||||
print("Trying Internet Archive fallback")
|
||||
|
||||
video_data, status = internet_archive(
|
||||
v[url_index], s3_client)
|
||||
|
||||
update_sheet(wks, i, status, video_data, columns, v)
|
||||
|
||||
else:
|
||||
try:
|
||||
ydl_opts = {
|
||||
'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||
if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
|
||||
print('Using cookie')
|
||||
youtube_dl.utils.std_headers['cookie'] = os.getenv(
|
||||
'FB_COOKIE')
|
||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
info = ydl.extract_info(
|
||||
v[url_index], download=False)
|
||||
@@ -434,9 +469,6 @@ def process_sheet(sheet):
|
||||
except:
|
||||
# i'm sure there's a better way to handle this than nested try/catch blocks
|
||||
try:
|
||||
wks.update(
|
||||
columns['status'] + str(i), 'Archive in progress')
|
||||
|
||||
print("Trying Internet Archive fallback")
|
||||
|
||||
video_data, status = internet_archive(
|
||||
|
||||
Reference in New Issue
Block a user