pyproject

This commit is contained in:
msramalho
2023-01-21 19:01:02 +00:00
parent ea2c266fa2
commit 753039240f
72 changed files with 398 additions and 683 deletions

View File

@@ -0,0 +1,6 @@
# we need to explicitly expose the available imports here
from .gworksheet import GWorksheet
from .misc import *
from .util import Util
from .webdriver import Webdriver
from .gsheet import Gsheets

View File

@@ -0,0 +1,44 @@
import json, gspread
from loguru import logger
from ..core import Step
class Gsheets(Step):
    """
    Step that opens a Google Sheets client from a service account file.

    The attributes read here (``service_account``, ``header``, ``sheet``)
    are presumably populated by ``Step.__init__`` from the entries declared
    in ``configs()`` -- confirm against ``Step``.
    """
    name = "gsheets"

    def __init__(self, config: dict) -> None:
        # Step.__init__ has to be invoked explicitly, it is not called otherwise
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        header_type = type(self.header)
        assert header_type == int, f"header ({self.header}) value must be an integer not {header_type}"
        assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."

    @staticmethod
    def configs() -> dict:
        """
        Describe the configuration entries of this step.

        Each entry maps an option name to its ``default`` value and ``help``
        text. ``columns`` additionally carries a ``cli_set`` callable that
        merges a JSON object given on the command line over the current value.
        """
        default_columns = {
            'url': 'link',
            'status': 'archive status',
            'folder': 'destination folder',
            'archive': 'archive location',
            'date': 'archive date',
            'thumbnail': 'thumbnail',
            'thumbnail_index': 'thumbnail index',
            'timestamp': 'upload timestamp',
            'title': 'upload title',
            'text': 'text content',
            'duration': 'duration',
            'screenshot': 'screenshot',
            'hash': 'hash',
            'wacz': 'wacz',
            'replaywebpage': 'replaywebpage',
        }
        sheet_option = {"default": None, "help": "name of the sheet to archive"}
        header_option = {"default": 1, "help": "index of the header row (starts at 1)"}
        service_account_option = {"default": "secrets/service_account.json", "help": "service account JSON file path"}
        columns_option = {
            "default": default_columns,
            "help": "names of columns in the google sheet (stringified JSON object)",
            # entries parsed from the CLI JSON override the current ones
            "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)),
        }
        return {
            "sheet": sheet_option,
            "header": header_option,
            "service_account": service_account_option,
            "columns": columns_option,
        }

View File

@@ -0,0 +1,109 @@
from gspread import utils
class GWorksheet:
    """
    This class makes read/write operations to a worksheet easier.
    It can read the headers from a custom row number, but the row references
    should always include the offset of the header.
    eg: if header=4, row 5 will be the first with data.

    All values are read once, at construction time, through
    ``worksheet.get_values()``; reads are served from that snapshot unless
    ``fresh=True`` is requested in ``get_cell``.
    """
    # internal column key -> header text expected in the sheet
    COLUMN_NAMES = {
        'url': 'link',
        'status': 'archive status',
        'folder': 'destination folder',
        'archive': 'archive location',
        'date': 'archive date',
        'thumbnail': 'thumbnail',
        'thumbnail_index': 'thumbnail index',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
        'text': 'text content',
        'duration': 'duration',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'wacz': 'wacz',
        'replaywebpage': 'replaywebpage',
    }

    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
        """
        worksheet: object exposing get_values/cell/update_cell/batch_update
        columns: mapping of internal column key -> header text in the sheet
        header_row: 1-based index of the row holding the headers
        """
        self.wks = worksheet
        self.columns = columns
        self.values = self.wks.get_values()
        # an empty sheet, or one that ends before the header row, has no headers
        if self.values and header_row <= len(self.values):
            self.headers = [v.lower() for v in self.values[header_row - 1]]
        else:
            self.headers = []

    def _check_col_exists(self, col: str):
        """Raise if `col` is not one of the configured column keys."""
        if col not in self.columns:
            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')

    def _header_name(self, col: str) -> str:
        """
        Return the header text configured for `col`, lower-cased so that it
        is comparable with self.headers (which are lower-cased on read).
        """
        self._check_col_exists(col)
        return self.columns[col].lower()

    def _col_index(self, col: str):
        """
        0-based position of `col` in the header row.
        Raises ValueError when the header is not present in the sheet.
        """
        return self.headers.index(self._header_name(col))

    def col_exists(self, col: str):
        """True when the header configured for `col` is present in the sheet."""
        return self._header_name(col) in self.headers

    def count_rows(self):
        """Number of rows in the snapshot (header rows included)."""
        return len(self.values)

    def get_row(self, row: int):
        # row is 1-based
        return self.values[row - 1]

    def get_values(self):
        """All rows as read at construction time."""
        return self.values

    def get_cell(self, row, col: str, fresh=False):
        """
        returns the cell value from (row, col),
        where row can be an index (1-based) OR list of values
        as received from self.get_row(row)
        if fresh=True, the sheet is queried again for this cell
        (row is passed as-is to the worksheet in that case)
        """
        col_index = self._col_index(col)
        if fresh:
            return self.wks.cell(row, col_index + 1).value
        if isinstance(row, int):
            row = self.get_row(row)
        # rows may be shorter than the header row; missing cells read as empty
        if col_index >= len(row):
            return ''
        return row[col_index]

    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
        """
        return self.get_cell or default value on error (eg: column is missing)
        if when_empty_use_default=True, a blank cell also yields the default
        """
        try:
            val = self.get_cell(row, col, fresh)
            if when_empty_use_default and val.strip() == "":
                return default
            return val
        except Exception:
            # best-effort read: any lookup failure falls back to the default,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            return default

    def set_cell(self, row: int, col: str, val):
        # row is 1-based
        col_index = self._col_index(col) + 1
        self.wks.update_cell(row, col_index, val)

    def batch_set_cell(self, cell_updates):
        """
        receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
        """
        payload = [
            {
                'range': self.to_a1(row, col),
                'values': [[val]]
            }
            for row, col, val in cell_updates
        ]
        self.wks.batch_update(payload, value_input_option='USER_ENTERED')

    def to_a1(self, row: int, col: str):
        # row is 1-based
        return utils.rowcol_to_a1(row, self._col_index(col) + 1)

View File

@@ -0,0 +1,42 @@
import os, json, requests
from datetime import datetime
from loguru import logger
def mkdir_if_not_exists(folder):
    """
    Create `folder` (and any missing parents) unless the path already exists.

    exist_ok=True closes the window between the existence check and the
    creation: if another process creates the folder in between, the call
    no longer raises FileExistsError.
    """
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
def expand_url(url, timeout=30):
    """
    Expand short URL links (currently only https://t.co/ links).

    url: the URL to expand; any URL without 'https://t.co/' is returned as-is
    timeout: seconds to wait for the HTTP request; without it a stalled
        server would block the caller indefinitely
    returns: the final URL after redirects, or `url` unchanged when the
        request fails
    """
    if 'https://t.co/' not in url:
        return url
    try:
        r = requests.get(url, timeout=timeout)
    except Exception:
        # best-effort: on any request failure fall back to the original url
        logger.error(f'Failed to expand url {url}')
        return url
    logger.debug(f'Expanded url {url} to {r.url}')
    return r.url
def getattr_or(o: object, prop: str, default=None):
    """
    Return attribute `prop` of `o`, or `default` when the attribute is
    missing, evaluates to None, or raises while being read.

    Falsy values other than None (0, "", []) are returned as-is.
    """
    try:
        res = getattr(o, prop)
    except Exception:
        # covers a missing attribute as well as properties that raise;
        # KeyboardInterrupt/SystemExit are deliberately not swallowed
        return default
    return default if res is None else res
class DateTimeEncoder(json.JSONEncoder):
    """
    JSON encoder that also accepts datetime objects.

    Usage: json.dumps(obj, cls=DateTimeEncoder)
    """

    def default(self, o):
        # anything that is not a datetime is left to the stock encoder,
        # which raises TypeError for unsupported types
        if not isinstance(o, datetime):
            return json.JSONEncoder.default(self, o)
        # str() keeps the timezone offset when the datetime carries one
        return str(o)
def dump_payload(p):
    """
    Serialize `p` to a JSON string indented by 4 spaces.

    Non-ASCII characters are written as-is (not escaped) and datetime
    values are handled through DateTimeEncoder.
    """
    dump_options = {"ensure_ascii": False, "indent": 4, "cls": DateTimeEncoder}
    return json.dumps(p, **dump_options)

View File

@@ -0,0 +1,19 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from ..core import Metadata, Step
#TODO: likely unused
@dataclass
class Util(Step):
    """
    Step flavour named "util" that declares an abstract `enrich` hook.
    """
    name = "util"

    def __init__(self, config: dict) -> None:
        # Step.__init__ is not called implicitly; forward the config so the
        # base class receives it instead of silently dropping the argument
        Step.__init__(self, config)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Util:
        # call the base implementation explicitly: zero-argument super()
        # cannot work here because this function takes no self/cls
        return Step.init(name, config, Util)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata: pass

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from loguru import logger
from selenium.webdriver.common.by import By
import time
class Webdriver:
    """
    Context manager around a headless Firefox selenium driver.

    Usage: ``with Webdriver(width, height, timeout_seconds) as driver: ...``
    The driver is closed and quit when the block exits.

    NOTE(review): __exit__ returns True, so any exception raised inside the
    ``with`` body is suppressed -- callers never see it. Kept as-is because
    callers may rely on it; confirm this is intended.
    """

    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False) -> None:
        """
        width, height: browser window size passed to set_window_size
        timeout_seconds: page load timeout passed to set_page_load_timeout
        facebook_accept_cookies: when True, try to dismiss the facebook
            cookie banner right after the driver is created
        """
        self.width = width
        self.height = height
        self.timeout_seconds = timeout_seconds
        self.facebook_accept_cookies = facebook_accept_cookies

    def __enter__(self) -> "webdriver.Firefox":
        """
        Create and configure the Firefox driver and return it.

        Raises TimeoutException when the driver itself could not be created;
        a timeout while applying window size / page load timeout is only
        logged and the driver is still returned.
        """
        options = webdriver.FirefoxOptions()
        options.headless = True
        # presumably stops Firefox from handing tg: links to an external app -- TODO confirm
        options.set_preference('network.protocol-handler.external.tg', False)
        created = False
        try:
            self.driver = webdriver.Firefox(options=options)
            created = True
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
            if not created:
                # there is no driver to return: surface the real cause rather
                # than failing later with an unrelated AttributeError
                raise

        if self.facebook_accept_cookies:
            try:
                logger.debug('Trying fb click accept cookie popup.')
                self.driver.get("http://www.facebook.com")
                accept_button = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                accept_button.click()
                logger.debug('fb click worked')
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except Exception:
                # best-effort: a missing banner must not prevent using the driver
                logger.warning('Failed on fb accept cookies.')

        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the window and quit the browser; always returns True."""
        driver = self.driver
        del self.driver
        try:
            driver.close()
        finally:
            # quit() must run even when close() fails, otherwise the browser
            # process is left behind
            driver.quit()
        return True