pyproject

2026-06-13 05:38:29 +03:00 · 2023-01-21 19:01:02 +00:00
parent ea2c266fa2
commit 753039240f
72 changed files with 398 additions and 683 deletions
--- a/src/auto_archiver/utils/init.py
+++ b/src/auto_archiver/utils/init.py
@@ -0,0 +1,6 @@
+# we need to explicitly expose the available imports here
+from .gworksheet import GWorksheet
+from .misc import *
+from .util import Util
+from .webdriver import Webdriver
+from .gsheet import Gsheets
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
@@ -0,0 +1,44 @@
+import json, gspread
+
+from loguru import logger
+from ..core import Step
+
+
+class Gsheets(Step):
+    name = "gsheets"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        self.gsheets_client = gspread.service_account(filename=self.service_account)
+        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
+        assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "sheet": {"default": None, "help": "name of the sheet to archive"},
+            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
+            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
+            "columns": {
+                "default": {
+                    'url': 'link',
+                    'status': 'archive status',
+                    'folder': 'destination folder',
+                    'archive': 'archive location',
+                    'date': 'archive date',
+                    'thumbnail': 'thumbnail',
+                    'thumbnail_index': 'thumbnail index',
+                    'timestamp': 'upload timestamp',
+                    'title': 'upload title',
+                    'text': 'text content',
+                    'duration': 'duration',
+                    'screenshot': 'screenshot',
+                    'hash': 'hash',
+                    'wacz': 'wacz',
+                    'replaywebpage': 'replaywebpage',
+                },
+                "help": "names of columns in the google sheet (stringified JSON object)",
+                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+            },
+        }
--- a/src/auto_archiver/utils/gworksheet.py
+++ b/src/auto_archiver/utils/gworksheet.py
@@ -0,0 +1,109 @@
+from gspread import utils
+
+
+class GWorksheet:
+    """
+    This class makes read/write operations to the a worksheet easier.
+    It can read the headers from a custom row number, but the row references
+    should always include the offset of the header. 
+    eg: if header=4, row 5 will be the first with data. 
+    """
+    COLUMN_NAMES = {
+        'url': 'link',
+        'status': 'archive status',
+        'folder': 'destination folder',
+        'archive': 'archive location',
+        'date': 'archive date',
+        'thumbnail': 'thumbnail',
+        'thumbnail_index': 'thumbnail index',
+        'timestamp': 'upload timestamp',
+        'title': 'upload title',
+        'duration': 'duration',
+        'screenshot': 'screenshot',
+        'hash': 'hash',
+        'wacz': 'wacz',
+        'replaywebpage': 'replaywebpage',
+    }
+
+    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
+        self.wks = worksheet
+        self.columns = columns
+        self.values = self.wks.get_values()
+        if len(self.values) > 0:
+            self.headers = [v.lower() for v in self.values[header_row - 1]]
+        else:
+            self.headers = []
+
+    def _check_col_exists(self, col: str):
+        if col not in self.columns:
+            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
+
+    def _col_index(self, col: str):
+        self._check_col_exists(col)
+        return self.headers.index(self.columns[col])
+
+    def col_exists(self, col: str):
+        self._check_col_exists(col)
+        return self.columns[col] in self.headers
+
+    def count_rows(self):
+        return len(self.values)
+
+    def get_row(self, row: int):
+        # row is 1-based
+        return self.values[row - 1]
+
+    def get_values(self):
+        return self.values
+
+    def get_cell(self, row, col: str, fresh=False):
+        """
+        returns the cell value from (row, col), 
+        where row can be an index (1-based) OR list of values
+        as received from self.get_row(row)
+        if fresh=True, the sheet is queried again for this cell
+        """
+        col_index = self._col_index(col)
+
+        if fresh:
+            return self.wks.cell(row, col_index + 1).value
+        if type(row) == int:
+            row = self.get_row(row)
+
+        if col_index >= len(row):
+            return ''
+        return row[col_index]
+
+    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
+        """
+        return self.get_cell or default value on error (eg: column is missing)
+        """
+        try:
+            val = self.get_cell(row, col, fresh)
+            if when_empty_use_default and val.strip() == "":
+                return default
+            return val
+        except:
+            return default
+
+    def set_cell(self, row: int, col: str, val):
+        # row is 1-based
+        col_index = self._col_index(col) + 1
+        self.wks.update_cell(row, col_index, val)
+
+    def batch_set_cell(self, cell_updates):
+        """
+        receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
+        """
+        cell_updates = [
+            {
+                'range': self.to_a1(row, col),
+                'values': [[val]]
+            }
+            for row, col, val in cell_updates
+        ]
+        self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
+
+    def to_a1(self, row: int, col: str):
+        # row is 1-based
+        return utils.rowcol_to_a1(row, self._col_index(col) + 1)
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -0,0 +1,42 @@
+
+import os, json, requests
+from datetime import datetime
+from loguru import logger
+
+
+def mkdir_if_not_exists(folder):
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+
+
+def expand_url(url):
+    # expand short URL links
+    if 'https://t.co/' in url:
+        try:
+            r = requests.get(url)
+            logger.debug(f'Expanded url {url} to {r.url}')
+            return r.url
+        except:
+            logger.error(f'Failed to expand url {url}')
+    return url
+
+
+def getattr_or(o: object, prop: str, default=None):
+    try:
+        res = getattr(o, prop)
+        if res is None: raise
+        return res
+    except:
+        return default
+
+
+class DateTimeEncoder(json.JSONEncoder):
+    # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
+    def default(self, o):
+        if isinstance(o, datetime):
+            return str(o)  # with timezone
+        return json.JSONEncoder.default(self, o)
+
+
+def dump_payload(p):
+    return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
--- a/src/auto_archiver/utils/util.py
+++ b/src/auto_archiver/utils/util.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+from abc import abstractmethod
+from dataclasses import dataclass
+from ..core import Metadata, Step
+
+#TODO: likely unused
+@dataclass
+class Util(Step):
+    name = "util"
+
+    def __init__(self, config: dict) -> None:
+        Step.__init__(self)
+        
+    # only for typing...
+    def init(name: str, config: dict) -> Util:
+        return super().init(name, config, Util)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata: pass
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from loguru import logger
+from selenium.webdriver.common.by import By
+import time
+
+
+class Webdriver:
+    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False) -> webdriver:
+        self.width = width
+        self.height = height
+        self.timeout_seconds = timeout_seconds
+        self.facebook_accept_cookies = facebook_accept_cookies
+
+    def __enter__(self) -> webdriver:
+        options = webdriver.FirefoxOptions()
+        options.headless = True
+        options.set_preference('network.protocol-handler.external.tg', False)
+        try:
+            self.driver = webdriver.Firefox(options=options)
+            self.driver.set_window_size(self.width, self.height)
+            self.driver.set_page_load_timeout(self.timeout_seconds)
+        except TimeoutException as e:
+            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
+
+        if self.facebook_accept_cookies:
+            try:
+                logger.debug(f'Trying fb click accept cookie popup.')
+                self.driver.get("http://www.facebook.com")
+                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
+                foo.click()
+                logger.debug(f'fb click worked')
+                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+                time.sleep(2)
+            except:
+                logger.warning(f'Failed on fb accept cookies.')
+
+        return self.driver
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.driver.close()
+        self.driver.quit()
+        del self.driver
+        return True