diff --git a/README.md b/README.md index cec6e9a..3d7f751 100644 --- a/README.md +++ b/README.md @@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) +# Code structure +Code is split into functional concepts: +1. [Archivers](archivers/) - receive a URL that they try to archive +2. [Storages](storages/) - they deal with where the archived files go +3. utilities + 1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet + +### Current Archivers +```mermaid +graph TD + A(Archiver) -->|parent of| B(TelegramArchiver) + A -->|parent of| C(TikTokArchiver) + A -->|parent of| D(YoutubeDLArchiver) + A -->|parent of| E(WaybackArchiver) +``` +### Current Storages +```mermaid +graph TD + A(BaseStorage) -->|parent of| B(S3Storage) +``` diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index b13a77f..6257aba 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -3,6 +3,7 @@ import ffmpeg import datetime from dataclasses import dataclass from abc import ABC, abstractmethod +from urllib.parse import urlparse from storages import Storage @@ -30,6 +31,9 @@ class Archiver(ABC): @abstractmethod def download(self, url, check_if_exists=False): pass + def get_netloc(self, url): + return urlparse(url).netloc + def get_key(self, filename): """ returns a key in the format "[archiverName]_[filename]" includes extension @@ -40,9 +44,12 @@ class Archiver(ABC): _id = _id.replace('unknown_video', 'jpg') return f'{self.name}_{_id}{extension}' - def get_thumbnails(self, filename, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) + def get_thumbnails(self, filename, key, duration=None): + thumbnails_folder = filename.split('.')[0] + '/' + key_folder = key.split('.')[0] + '/' + + if not os.path.exists(thumbnails_folder): + os.mkdir(thumbnails_folder) fps = 0.5 if duration is not None: @@ -57,15 +64,14 @@ class Archiver(ABC): stream = ffmpeg.input(filename) stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() + stream.output(thumbnails_folder + 'out%d.jpg').run() - thumbnails = os.listdir(filename.split('.')[0] + '/') + thumbnails = os.listdir(thumbnails_folder) cdn_urls = [] - for fname in thumbnails: if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname + thumbnail_filename = thumbnails_folder + fname + key = key_folder + fname cdn_url = self.storage.get_cdn_url(key) @@ -86,12 +92,12 @@ class Archiver(ABC): index_page += f'' index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' + index_fname = thumbnails_folder + 'index.html' with open(index_fname, 'w') as f: f.write(index_page) - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' + thumb_index = key_folder + 'index.html' self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5593acd..5a9b013 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -10,7 +10,7 @@ class TelegramArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: + if 't.me' != self.get_netloc(url): return False headers = { @@ -20,7 +20,7 @@ class TelegramArchiver(Archiver): original_url = url - # TODO: check if we can do this more resilient to user-input + # TODO: check if we can do this more resilient to variable URLs if url[-8:] != "?embed=1": url += "?embed=1" @@ -32,8 +32,8 @@ class TelegramArchiver(Archiver): return False # could not find video video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - key = self.get_key(key) + video_id = video_url.split('/')[-1].split('?')[0] + key = self.get_key(video_id) filename = 'tmp/' + key @@ -60,7 +60,7 @@ class TelegramArchiver(Archiver): duration = float(duration) # process thumbnails - key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) os.remove(filename) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index b54f956..62aa415 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -37,8 +37,9 @@ class TiktokArchiver(Archiver): self.storage.upload(filename, key) try: - key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration) - except: + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration) + except Exception as e: + logger.error(e) key_thumb = '' thumb_index = 'error creating thumbnails' diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 88f7970..ec11061 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult class YoutubeDLArchiver(Archiver): name = "yotube_dl" + ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): + netloc = self.get_netloc(url) + if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'): logger.info('Using Facebook cookie') youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - ydl = youtube_dl.YoutubeDL(ydl_opts) + ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts) cdn_url = None status = 'success' @@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver): # no video here return False - if 'is_live' in info and info['is_live']: + if info.get('is_live', False): logger.warning("Live streaming media, not archiving now") return ArchiveResult(status="Streaming media") @@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver): self.storage.upload(filename, key) # get duration - duration = info['duration'] if 'duration' in info else None + duration = info.get('duration') # get thumbnails try: - key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) except: key_thumb = '' thumb_index = 'Could not generate thumbnails' diff --git a/gworksheet.py b/gworksheet.py index 88de9a4..4349e2a 100644 --- a/gworksheet.py +++ b/gworksheet.py @@ -63,13 +63,13 @@ class GWorksheet: """ cell_updates = [ { - 'range': self.to_a1(row, self._col_index(col) + 1), + 'range': self.to_a1(row, col), 'values': [[val]] } for row, col, val in cell_updates ] self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED') - def to_a1(self, row: int, col: int): - # row, col are 1-based - return utils.rowcol_to_a1(row, col) + def to_a1(self, row: int, col: str): + # row is 1-based + return utils.rowcol_to_a1(row, self._col_index(col) + 1)