cleanup and docs

This commit is contained in:
msramalho
2022-02-23 16:07:58 +01:00
parent 9550cd509e
commit 9a264a7dfe
6 changed files with 55 additions and 27 deletions

View File

@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
# Code structure
Code is split into functional concepts:
1. [Archivers](archivers/) - receive a URL and try to archive its content
2. [Storages](storages/) - handle where the archived files are stored
3. Utilities
   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
### Current Archivers
```mermaid
graph TD
A(Archiver) -->|parent of| B(TelegramArchiver)
A -->|parent of| C(TikTokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
A -->|parent of| E(WaybackArchiver)
```
### Current Storages
```mermaid
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
```

View File

@@ -3,6 +3,7 @@ import ffmpeg
import datetime import datetime
from dataclasses import dataclass from dataclasses import dataclass
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from urllib.parse import urlparse
from storages import Storage from storages import Storage
@@ -30,6 +31,9 @@ class Archiver(ABC):
@abstractmethod @abstractmethod
def download(self, url, check_if_exists=False): pass def download(self, url, check_if_exists=False): pass
def get_netloc(self, url):
return urlparse(url).netloc
def get_key(self, filename): def get_key(self, filename):
""" """
returns a key in the format "[archiverName]_[filename]" includes extension returns a key in the format "[archiverName]_[filename]" includes extension
@@ -40,9 +44,12 @@ class Archiver(ABC):
_id = _id.replace('unknown_video', 'jpg') _id = _id.replace('unknown_video', 'jpg')
return f'{self.name}_{_id}{extension}' return f'{self.name}_{_id}{extension}'
def get_thumbnails(self, filename, duration=None): def get_thumbnails(self, filename, key, duration=None):
if not os.path.exists(filename.split('.')[0]): thumbnails_folder = filename.split('.')[0] + '/'
os.mkdir(filename.split('.')[0]) key_folder = key.split('.')[0] + '/'
if not os.path.exists(thumbnails_folder):
os.mkdir(thumbnails_folder)
fps = 0.5 fps = 0.5
if duration is not None: if duration is not None:
@@ -57,15 +64,14 @@ class Archiver(ABC):
stream = ffmpeg.input(filename) stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run() stream.output(thumbnails_folder + 'out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/') thumbnails = os.listdir(thumbnails_folder)
cdn_urls = [] cdn_urls = []
for fname in thumbnails: for fname in thumbnails:
if fname[-3:] == 'jpg': if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname thumbnail_filename = thumbnails_folder + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
@@ -86,12 +92,12 @@ class Archiver(ABC):
index_page += f'<img src="{t}" />' index_page += f'<img src="{t}" />'
index_page += f"</body></html>" index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html' index_fname = thumbnails_folder + 'index.html'
with open(index_fname, 'w') as f: with open(index_fname, 'w') as f:
f.write(index_page) f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' thumb_index = key_folder + 'index.html'
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

View File

@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url: if 't.me' != self.get_netloc(url):
return False return False
headers = { headers = {
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):
original_url = url original_url = url
# TODO: check if we can do this more resilient to user-input # TODO: check if we can do this more resilient to variable URLs
if url[-8:] != "?embed=1": if url[-8:] != "?embed=1":
url += "?embed=1" url += "?embed=1"
@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
return False # could not find video return False # could not find video
video_url = video.get('src') video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0] video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(key) key = self.get_key(video_id)
filename = 'tmp/' + key filename = 'tmp/' + key
@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
duration = float(duration) duration = float(duration)
# process thumbnails # process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
os.remove(filename) os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,

View File

@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
self.storage.upload(filename, key) self.storage.upload(filename, key)
try: try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration) key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
except: except Exception as e:
logger.error(e)
key_thumb = '' key_thumb = ''
thumb_index = 'error creating thumbnails' thumb_index = 'error creating thumbnails'

View File

@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver): class YoutubeDLArchiver(Archiver):
name = "yotube_dl" name = "yotube_dl"
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} netloc = self.get_netloc(url)
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie') logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts) ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
cdn_url = None cdn_url = None
status = 'success' status = 'success'
@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
# no video here # no video here
return False return False
if 'is_live' in info and info['is_live']: if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now") logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media") return ArchiveResult(status="Streaming media")
@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
self.storage.upload(filename, key) self.storage.upload(filename, key)
# get duration # get duration
duration = info['duration'] if 'duration' in info else None duration = info.get('duration')
# get thumbnails # get thumbnails
try: try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
except: except:
key_thumb = '' key_thumb = ''
thumb_index = 'Could not generate thumbnails' thumb_index = 'Could not generate thumbnails'

View File

@@ -63,13 +63,13 @@ class GWorksheet:
""" """
cell_updates = [ cell_updates = [
{ {
'range': self.to_a1(row, self._col_index(col) + 1), 'range': self.to_a1(row, col),
'values': [[val]] 'values': [[val]]
} }
for row, col, val in cell_updates for row, col, val in cell_updates
] ]
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED') self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
def to_a1(self, row: int, col: int): def to_a1(self, row: int, col: str):
# row, col are 1-based # row is 1-based
return utils.rowcol_to_a1(row, col) return utils.rowcol_to_a1(row, self._col_index(col) + 1)