Add hash and screenshot methods; switch to more recent ytdl fork

This commit is contained in:
Logan Williams
2022-02-25 13:54:40 +01:00
parent d76e3bc7ec
commit 1eb17e4de5
7 changed files with 444 additions and 37 deletions

View File

@@ -5,6 +5,10 @@ import shutil
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
import hashlib
from selenium.common.exceptions import TimeoutException
from loguru import logger
import time
from storages import Storage
from utils import mkdir_if_not_exists
@@ -19,13 +23,16 @@ class ArchiveResult:
duration: float = None
title: str = None
timestamp: datetime.datetime = None
screenshot: str = None
hash: str = None
class Archiver(ABC):
name = "default"
def __init__(self, storage: Storage):
def __init__(self, storage: Storage, driver):
self.storage = storage
self.driver = driver
# String form of an archiver is the name of its concrete class.
def __str__(self):
return self.__class__.__name__
@@ -46,6 +53,26 @@ class Archiver(ABC):
_id = _id.replace('unknown_video', 'jpg')
return f'{self.name}_{_id}{extension}'
def get_hash(self, filename):
    """Return the SHA-256 hex digest of the file at ``filename``.

    The file is read in fixed-size chunks so that large downloads do not
    have to fit in memory, and it is opened with a context manager so the
    handle is closed even if reading fails.

    Args:
        filename: Path of the file to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(filename, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
def get_screenshot(self, url):
    """Capture a screenshot of ``url``, upload it, and return its CDN URL.

    The page is loaded with ``self.driver`` (presumably a Selenium
    WebDriver -- confirm against the caller), saved as a PNG under
    ``tmp/``, and uploaded through ``self.storage`` with a public-read ACL.

    A page-load timeout is not fatal: whatever the browser managed to
    render is still captured, so one slow page does not abort an archive
    whose media has already been stored.

    Args:
        url: Address of the page to capture.

    Returns:
        The value of ``self.storage.get_cdn_url(key)`` for the uploaded
        screenshot.
    """
    # isoformat() separates date and time with "T", so the result has no
    # spaces that would need replacing.
    timestamp = datetime.datetime.utcnow().isoformat()
    url_path = urlparse(url).path.replace("/", "_")
    key = self.get_key(url_path + timestamp + ".png")
    filename = 'tmp/' + key

    try:
        self.driver.get(url)
        # Give dynamic content a moment to render before capturing.
        time.sleep(6)
    except TimeoutException:
        logger.info(f'TimeoutException loading page for screenshot: {url}')

    self.driver.save_screenshot(filename)
    self.storage.upload(filename, key, extra_args={
        'ACL': 'public-read', 'ContentType': 'image/png'})
    return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = filename.split('.')[0] + '/'
key_folder = key.split('.')[0] + '/'

View File

@@ -49,6 +49,9 @@ class TelegramArchiver(Archiver):
if status != 'already archived':
self.storage.upload(filename, key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
@@ -58,8 +61,9 @@ class TelegramArchiver(Archiver):
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
key_thumb, thumb_index = self.get_thumbnails(
filename, key, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)

View File

@@ -43,12 +43,16 @@ class TiktokArchiver(Archiver):
key_thumb = ''
thumb_index = 'error creating thumbnails'
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
try: os.remove(filename)
except FileNotFoundError:
logger.info(f'tmp file not found thus not deleted {filename}')
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'

View File

@@ -8,8 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, storage: Storage):
super(WaybackArchiver, self).__init__(storage)
def __init__(self, storage: Storage, driver):
super(WaybackArchiver, self).__init__(storage, driver)
self.seen_urls = {}
def download(self, url, check_if_exists=False):
@@ -71,6 +71,7 @@ class WaybackArchiver(Archiver):
except:
title = "Could not get title"
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
screenshot = self.get_screenshot(url)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result
return result

View File

@@ -1,29 +1,29 @@
import os
import datetime
import youtube_dl
import yt_dlp
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
name = "youtube_dl"
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
except yt_dlp.utils.DownloadError:
# no video here
return False
@@ -74,6 +74,9 @@ class YoutubeDLArchiver(Archiver):
self.storage.upload(filename, key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
# get duration
duration = info.get('duration')
@@ -89,4 +92,4 @@ class YoutubeDLArchiver(Archiver):
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp)
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)