cleanup and docs

2026-06-08 03:18:28 +03:00 · 2022-02-23 16:07:58 +01:00
parent 9550cd509e
commit 9a264a7dfe
6 changed files with 55 additions and 27 deletions
--- a/README.md
+++ b/README.md
@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil

 ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)

+# Code structure
+Code is split into functional concepts:
+1. [Archivers](archivers/) - receive a URL that they try to archive
+2. [Storages](storages/) - they deal with where the archived files go
+3. Utilities
+   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
+
+### Current Archivers
+```mermaid
+graph TD
+    A(Archiver) -->|parent of| B(TelegramArchiver)
+    A -->|parent of| C(TiktokArchiver)
+    A -->|parent of| D(YoutubeDLArchiver)
+    A -->|parent of| E(WaybackArchiver)
+```
+### Current Storages
+```mermaid
+graph TD
+    A(BaseStorage) -->|parent of| B(S3Storage)
+```
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -3,6 +3,7 @@ import ffmpeg
 import datetime
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
+from urllib.parse import urlparse

 from storages import Storage

@@ -30,6 +31,9 @@ class Archiver(ABC):
    @abstractmethod
    def download(self, url, check_if_exists=False): pass

+    def get_netloc(self, url):
+        return urlparse(url).netloc
+
    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]" includes extension
@@ -40,9 +44,12 @@ class Archiver(ABC):
            _id = _id.replace('unknown_video', 'jpg')
        return f'{self.name}_{_id}{extension}'

-    def get_thumbnails(self, filename, duration=None):
-        if not os.path.exists(filename.split('.')[0]):
-            os.mkdir(filename.split('.')[0])
+    def get_thumbnails(self, filename, key, duration=None):
+        thumbnails_folder = filename.split('.')[0] + '/'
+        key_folder = key.split('.')[0] + '/'
+
+        if not os.path.exists(thumbnails_folder):
+            os.mkdir(thumbnails_folder)

        fps = 0.5
        if duration is not None:
@@ -57,15 +64,14 @@ class Archiver(ABC):

        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-        stream.output(filename.split('.')[0] + '/out%d.jpg').run()
+        stream.output(thumbnails_folder + 'out%d.jpg').run()

-        thumbnails = os.listdir(filename.split('.')[0] + '/')
+        thumbnails = os.listdir(thumbnails_folder)
        cdn_urls = []
-
        for fname in thumbnails:
            if fname[-3:] == 'jpg':
-                thumbnail_filename = filename.split('.')[0] + '/' + fname
-                key = filename.split('/')[1].split('.')[0] + '/' + fname
+                thumbnail_filename = thumbnails_folder + fname
+                key = key_folder + fname

                cdn_url = self.storage.get_cdn_url(key)

@@ -86,12 +92,12 @@ class Archiver(ABC):
            index_page += f'<img src="{t}" />'

        index_page += f"</body></html>"
-        index_fname = filename.split('.')[0] + '/index.html'
+        index_fname = thumbnails_folder + 'index.html'

        with open(index_fname, 'w') as f:
            f.write(index_page)

-        thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
+        thumb_index = key_folder + 'index.html'

        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):

    def download(self, url, check_if_exists=False):
        # detect URLs that we definitely cannot handle
-        if 'http://t.me/' not in url and 'https://t.me/' not in url:
+        if 't.me' != self.get_netloc(url):
            return False

        headers = {
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):

        original_url = url

-        # TODO: check if we can do this more resilient to user-input
+        # TODO: check if we can make this more resilient to variable URLs
        if url[-8:] != "?embed=1":
            url += "?embed=1"

@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
            return False  # could not find video

        video_url = video.get('src')
-        key = video_url.split('/')[-1].split('?')[0]
-        key = self.get_key(key)
+        video_id = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(video_id)

        filename = 'tmp/' + key

@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
            duration = float(duration)

        # process thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        os.remove(filename)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
                self.storage.upload(filename, key)

            try:
-                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
-            except:
+                key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
+            except Exception as e:
+                logger.error(e)
                key_thumb = ''
                thumb_index = 'error creating thumbnails'

--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult

 class YoutubeDLArchiver(Archiver):
    name = "yotube_dl"
+    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

    def download(self, url, check_if_exists=False):
-        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
+        netloc = self.get_netloc(url)
+        if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')

-        ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
        cdn_url = None
        status = 'success'

@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
            # no video here
            return False

-        if 'is_live' in info and info['is_live']:
+        if info.get('is_live', False):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
            self.storage.upload(filename, key)

        # get duration
-        duration = info['duration'] if 'duration' in info else None
+        duration = info.get('duration')

        # get thumbnails
        try:
-            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        except:
            key_thumb = ''
            thumb_index = 'Could not generate thumbnails'
--- a/gworksheet.py
+++ b/gworksheet.py
@@ -63,13 +63,13 @@ class GWorksheet:
        """
        cell_updates = [
            {
-                'range': self.to_a1(row, self._col_index(col) + 1),
+                'range': self.to_a1(row, col),
                'values': [[val]]
            }
            for row, col, val in cell_updates
        ]
        self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')

-    def to_a1(self, row: int, col: int):
-        # row, col are 1-based
-        return utils.rowcol_to_a1(row, col)
+    def to_a1(self, row: int, col: str):
+        # row is 1-based
+        return utils.rowcol_to_a1(row, self._col_index(col) + 1)