cleanup and docs

This commit is contained in:
msramalho
2022-02-23 16:07:58 +01:00
parent 9550cd509e
commit 9a264a7dfe
6 changed files with 55 additions and 27 deletions

View File

@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
# Code structure
Code is split into functional concepts:
1. [Archivers](archivers/) - each one receives a URL and tries to archive it
2. [Storages](storages/) - handle where the archived files are stored
3. Utilities
   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
### Current Archivers
```mermaid
graph TD
A(Archiver) -->|parent of| B(TelegramArchiver)
A -->|parent of| C(TikTokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
A -->|parent of| E(WaybackArchiver)
```
### Current Storages
```mermaid
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
```

View File

@@ -3,6 +3,7 @@ import ffmpeg
import datetime
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from storages import Storage
@@ -30,6 +31,9 @@ class Archiver(ABC):
@abstractmethod
def download(self, url, check_if_exists=False): pass
def get_netloc(self, url):
    """
    returns the network location ("host[:port]") of `url` with the host part lower-cased

    Host names are case-insensitive, so the host (and port) portion is
    normalised to lower case; this lets callers compare the result against
    lower-case literals such as 't.me'. A "user:password@" prefix, if present,
    is returned unchanged because credentials are case-sensitive.
    Returns '' when urlparse finds no network location (e.g. 't.me/x', which
    has no "//" before the host).
    """
    netloc = urlparse(url).netloc
    # everything after the last '@' is host[:port]; the part before it is userinfo
    userinfo, at, hostport = netloc.rpartition('@')
    return userinfo + at + hostport.lower()
def get_key(self, filename):
"""
returns a key in the format "[archiverName]_[filename]" includes extension
@@ -40,9 +44,12 @@ class Archiver(ABC):
_id = _id.replace('unknown_video', 'jpg')
return f'{self.name}_{_id}{extension}'
def get_thumbnails(self, filename, duration=None):
if not os.path.exists(filename.split('.')[0]):
os.mkdir(filename.split('.')[0])
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = filename.split('.')[0] + '/'
key_folder = key.split('.')[0] + '/'
if not os.path.exists(thumbnails_folder):
os.mkdir(thumbnails_folder)
fps = 0.5
if duration is not None:
@@ -57,15 +64,14 @@ class Archiver(ABC):
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
stream.output(thumbnails_folder + 'out%d.jpg').run()
thumbnails = os.listdir(filename.split('.')[0] + '/')
thumbnails = os.listdir(thumbnails_folder)
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key)
@@ -86,12 +92,12 @@ class Archiver(ABC):
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = filename.split('.')[0] + '/index.html'
index_fname = thumbnails_folder + 'index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
thumb_index = key_folder + 'index.html'
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

View File

@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
if 't.me' != self.get_netloc(url):
return False
headers = {
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):
original_url = url
# TODO: check if we can do this more resilient to user-input
# TODO: check if we can make this more resilient to variable URLs
if url[-8:] != "?embed=1":
url += "?embed=1"
@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
return False # could not find video
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
key = self.get_key(key)
video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id)
filename = 'tmp/' + key
@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,

View File

@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
self.storage.upload(filename, key)
try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
except:
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
except Exception as e:
logger.error(e)
key_thumb = ''
thumb_index = 'error creating thumbnails'

View File

@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
cdn_url = None
status = 'success'
@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
# no video here
return False
if 'is_live' in info and info['is_live']:
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
self.storage.upload(filename, key)
# get duration
duration = info['duration'] if 'duration' in info else None
duration = info.get('duration')
# get thumbnails
try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
except:
key_thumb = ''
thumb_index = 'Could not generate thumbnails'

View File

@@ -63,13 +63,13 @@ class GWorksheet:
"""
cell_updates = [
{
'range': self.to_a1(row, self._col_index(col) + 1),
'range': self.to_a1(row, col),
'values': [[val]]
}
for row, col, val in cell_updates
]
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
def to_a1(self, row: int, col: int):
# row, col are 1-based
return utils.rowcol_to_a1(row, col)
def to_a1(self, row: int, col: str):
    """
    returns the A1-notation address of the cell at (`row`, `col`)

    `row` is 1-based and is handed to utils.rowcol_to_a1 unchanged; `col` is
    resolved through self._col_index and incremented by one before the
    conversion (presumably because _col_index is 0-based - confirm against
    its definition).
    """
    column_number = self._col_index(col) + 1
    return utils.rowcol_to_a1(row, column_number)