diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 16c6ccf..5593acd 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -54,8 +54,8 @@ class TelegramArchiver(Archiver): # extract duration from HTML duration = s.find_all('time')[0].contents[0] if ':' in duration: - duration = float(duration.split( - ':')[0]) * 60 + float(duration.split(':')[1]) + duration = float(duration.split(':')[0]) * 60 + + float(duration.split(':')[1]) else: duration = float(duration) diff --git a/auto_archive.py b/auto_archive.py index 7e624d0..cb70c58 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -14,17 +14,18 @@ load_dotenv() def update_sheet(gw, row, result: archivers.ArchiveResult): - update = [] + cell_updates = [] + row_values = gw.get_row(row) def batch_if_valid(col, val, final_value=None): final_value = final_value or val - if val and gw.col_exists(col) and gw.cell(row, col) == '': - update.append((row, col, final_value)) + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + cell_updates.append((row, col, final_value)) - update.append((row, 'status', result.status)) + cell_updates.append((row, 'status', result.status)) batch_if_valid('archive', result.cdn_url) - batch_if_valid('archive', True, datetime.datetime.now().isoformat()) + batch_if_valid('date', True, datetime.datetime.now().isoformat()) batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') batch_if_valid('thumbnail_index', result.thumbnail_index) batch_if_valid('title', result.title) @@ -34,7 +35,18 @@ def update_sheet(gw, row, result: archivers.ArchiveResult): result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat() batch_if_valid('timestamp', result.timestamp) - gw.update_batch(update) + gw.batch_set_cell(cell_updates) + + +def expand_url(url): + # expand short URL links + if 'https://t.co/' in url: + try: + r = requests.get(url) + url = r.url + except: + logger.error(f'Failed to expand url {url}') + return url def process_sheet(sheet): @@ -74,38 +86,34 @@ def process_sheet(sheet): ] # loop through rows in worksheet - for i in range(2, gw.count_rows() + 1): - row = gw.get_row(i) - url = gw.cell(row, 'url') - status = gw.cell(row, 'status') + for row in range(2, gw.count_rows() + 1): + url = gw.get_cell(row, 'url') + status = gw.get_cell(row, 'status') if url != '' and status in ['', None]: - gw.update(i, 'status', 'Archive in progress') + gw.set_cell(row, 'status', 'Archive in progress') - # expand short URL links - if 'https://t.co/' in url: - r = requests.get(url) - url = r.url + url = expand_url(url) for archiver in active_archivers: - logger.debug(f'Trying {archiver} on row {i}') + logger.debug(f'Trying {archiver} on row {row}') + # TODO: add support for multiple videos/images - result = archiver.download(url, check_if_exists=True) + try: + result = archiver.download(url, check_if_exists=True) + except Exception as e: + result = False + logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}') if result: - logger.success(f'{archiver} succeeded on row {i}') - break + if result.status in ['success', 'already archived']: + logger.success(f'{archiver} succeeded on row {row}') + break + logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') if result: - update_sheet(gw, i, result) + update_sheet(gw, row, result) else: - gw.update(i, 'status', 'failed: no archiver') - - # # except: - # # if any unexpected errors occured, log these into the Google Sheet - # # t, value, traceback = sys.exc_info() - - # # update_sheet(wks, i, str( - # # value), {}, columns, v) + gw.set_cell(row, 'status', 'failed: no archiver') def main(): diff --git a/gworksheet.py b/gworksheet.py index 496ddcc..88de9a4 100644 --- a/gworksheet.py +++ b/gworksheet.py @@ -19,20 +19,18 @@ class GWorksheet: self.headers = [v.lower() for v in self.wks.row_values(1)] self.columns = columns - def worksheet(self): return self.wks - def _check_col_exists(self, col: str): if col not in self.columns: raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}') + def _col_index(self, col: str): + self._check_col_exists(col) + return self.headers.index(self.columns[col]) + def col_exists(self, col: str): self._check_col_exists(col) return self.columns[col] in self.headers - def col_index(self, col: str): - self._check_col_exists(col) - return self.headers.index(self.columns[col]) - def count_rows(self): return len(self.wks.get_values()) @@ -40,30 +38,37 @@ class GWorksheet: # row is 1-based return self.wks.row_values(row) - def cell(self, row, col: str): - # row can be index (1-based) or list of values + def get_cell(self, row, col: str): + """ + returns the cell value from (row, col), + where row can be an index (1-based) OR list of values + as received from self.get_row(row) + """ if type(row) == int: row = self.get_row(row) - col_index = self.col_index(col) + col_index = self._col_index(col) if col_index >= len(row): return '' return row[col_index] - def update(self, row: int, col: str, val): + def set_cell(self, row: int, col: str, val): # row is 1-based - col_index = self.col_index(col) + 1 + col_index = self._col_index(col) + 1 self.wks.update_cell(row, col_index, val) - def update_batch(self, updates): - updates = [ + def batch_set_cell(self, cell_updates): + """ + receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method + """ + cell_updates = [ { - 'range': self.to_a1(row, self.col_index(col) + 1), + 'range': self.to_a1(row, self._col_index(col) + 1), 'values': [[val]] } - for row, col, val in updates + for row, col, val in cell_updates ] - self.wks.batch_update(updates, value_input_option='USER_ENTERED') + self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED') def to_a1(self, row: int, col: int): # row, col are 1-based