mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Add hash and screenshot methods; switch to more recent ytdl fork
This commit is contained in:
@@ -5,6 +5,10 @@ import shutil
|
||||
from dataclasses import dataclass
|
||||
from abc import ABC, abstractmethod
|
||||
from urllib.parse import urlparse
|
||||
import hashlib
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from loguru import logger
|
||||
import time
|
||||
|
||||
from storages import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
@@ -19,13 +23,16 @@ class ArchiveResult:
|
||||
duration: float = None
|
||||
title: str = None
|
||||
timestamp: datetime.datetime = None
|
||||
screenshot: str = None
|
||||
hash: str = None
|
||||
|
||||
|
||||
class Archiver(ABC):
|
||||
name = "default"
|
||||
|
||||
def __init__(self, storage: Storage):
|
||||
def __init__(self, storage: Storage, driver):
|
||||
self.storage = storage
|
||||
self.driver = driver
|
||||
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
@@ -46,6 +53,26 @@ class Archiver(ABC):
|
||||
_id = _id.replace('unknown_video', 'jpg')
|
||||
return f'{self.name}_{_id}{extension}'
|
||||
|
||||
def get_hash(self, filename):
    """Return the SHA-256 hex digest of the file at ``filename``.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    large media files are never loaded into memory in one piece. The
    ``with`` block guarantees the handle is closed even if a read fails.

    Args:
        filename: Path of the file to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(filename, "rb") as f:
        # iter() with a sentinel stops at the empty bytes object that
        # read() returns on end-of-file.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
def get_screenshot(self, url):
    """Capture a screenshot of ``url`` and return the uploaded image's CDN URL.

    The storage key is built from the URL path (slashes replaced by
    underscores) followed by the current UTC time in ISO format and a
    ``.png`` suffix, passed through ``self.get_key``. The page is loaded
    with ``self.driver``, saved under ``tmp/``, uploaded through
    ``self.storage`` and the resulting CDN URL is returned.
    """
    url_path = urlparse(url).path.replace("/", "_")
    stamp = datetime.datetime.utcnow().isoformat().replace(" ", "_")
    key = self.get_key(url_path + stamp + ".png")
    filename = 'tmp/' + key

    self.driver.get(url)
    # Fixed pause before capturing; presumably gives the page time to
    # finish rendering -- NOTE(review): confirm the intent of 6 seconds.
    time.sleep(6)
    self.driver.save_screenshot(filename)

    upload_args = {'ACL': 'public-read', 'ContentType': 'image/png'}
    self.storage.upload(filename, key, extra_args=upload_args)
    return self.storage.get_cdn_url(key)
|
||||
|
||||
def get_thumbnails(self, filename, key, duration=None):
|
||||
thumbnails_folder = filename.split('.')[0] + '/'
|
||||
key_folder = key.split('.')[0] + '/'
|
||||
|
||||
@@ -49,6 +49,9 @@ class TelegramArchiver(Archiver):
|
||||
if status != 'already archived':
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
hash = self.get_hash(filename)
|
||||
screenshot = self.get_screenshot(url)
|
||||
|
||||
# extract duration from HTML
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
@@ -58,8 +61,9 @@ class TelegramArchiver(Archiver):
|
||||
duration = float(duration)
|
||||
|
||||
# process thumbnails
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
|
||||
key_thumb, thumb_index = self.get_thumbnails(
|
||||
filename, key, duration=duration)
|
||||
os.remove(filename)
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
|
||||
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
|
||||
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
|
||||
|
||||
@@ -43,12 +43,16 @@ class TiktokArchiver(Archiver):
|
||||
key_thumb = ''
|
||||
thumb_index = 'error creating thumbnails'
|
||||
|
||||
hash = self.get_hash(filename)
|
||||
screenshot = self.get_screenshot(url)
|
||||
|
||||
try: os.remove(filename)
|
||||
except FileNotFoundError:
|
||||
logger.info(f'tmp file not found thus not deleted {filename}')
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
|
||||
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
|
||||
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
|
||||
hash=hash, screenshot=screenshot)
|
||||
|
||||
except tiktok_downloader.Except.InvalidUrl:
|
||||
status = 'Invalid URL'
|
||||
|
||||
@@ -8,8 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
class WaybackArchiver(Archiver):
|
||||
name = "wayback"
|
||||
|
||||
def __init__(self, storage: Storage):
|
||||
super(WaybackArchiver, self).__init__(storage)
|
||||
def __init__(self, storage: Storage, driver):
|
||||
super(WaybackArchiver, self).__init__(storage, driver)
|
||||
self.seen_urls = {}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
@@ -71,6 +71,7 @@ class WaybackArchiver(Archiver):
|
||||
except:
|
||||
title = "Could not get title"
|
||||
|
||||
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
|
||||
screenshot = self.get_screenshot(url)
|
||||
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||
self.seen_urls[url] = result
|
||||
return result
|
||||
|
||||
@@ -1,29 +1,29 @@
|
||||
|
||||
import os
|
||||
import datetime
|
||||
import youtube_dl
|
||||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "yotube_dl"
|
||||
name = "youtube_dl"
|
||||
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
netloc = self.get_netloc(url)
|
||||
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
|
||||
logger.info('Using Facebook cookie')
|
||||
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
||||
yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
||||
|
||||
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
|
||||
ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
|
||||
cdn_url = None
|
||||
status = 'success'
|
||||
|
||||
try:
|
||||
info = ydl.extract_info(url, download=False)
|
||||
except youtube_dl.utils.DownloadError:
|
||||
except yt_dlp.utils.DownloadError:
|
||||
# no video here
|
||||
return False
|
||||
|
||||
@@ -74,6 +74,9 @@ class YoutubeDLArchiver(Archiver):
|
||||
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
hash = self.get_hash(filename)
|
||||
screenshot = self.get_screenshot(url)
|
||||
|
||||
# get duration
|
||||
duration = info.get('duration')
|
||||
|
||||
@@ -89,4 +92,4 @@ class YoutubeDLArchiver(Archiver):
|
||||
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
|
||||
title=info['title'] if 'title' in info else None, timestamp=timestamp)
|
||||
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
|
||||
|
||||
Reference in New Issue
Block a user