mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
The [browsertrix-crawler] utility is a browser-based crawler that can crawl one or more pages. browsertrix-crawler creates archives in the [WACZ] format which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc) which can then be replayed using the [ReplayWeb.page] web component, or unzipped to get the original WARC data (the ISO standard format used by the Internet Archive Wayback Machine). This PR adds browsertrix-crawler to archiver classes where screenshots are made made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, digitalocean, etc) using the client side ReplayWeb page. You can see an example of the spreadsheet here: https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0 browsertrix-crawler requires Docker to be installed. If Docker is not installed an error message will be logged and things continue as normal. [browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler [WACZ]: https://specs.webrecorder.net/wacz/latest/ [ReplayWeb.page]: https://replayweb.page
109 lines
3.5 KiB
Python
109 lines
3.5 KiB
Python
from gspread import utils
|
|
|
|
|
|
class GWorksheet:
|
|
"""
|
|
This class makes read/write operations to the a worksheet easier.
|
|
It can read the headers from a custom row number, but the row references
|
|
should always include the offset of the header.
|
|
eg: if header=4, row 5 will be the first with data.
|
|
"""
|
|
COLUMN_NAMES = {
|
|
'url': 'link',
|
|
'status': 'archive status',
|
|
'folder': 'destination folder',
|
|
'archive': 'archive location',
|
|
'date': 'archive date',
|
|
'thumbnail': 'thumbnail',
|
|
'thumbnail_index': 'thumbnail index',
|
|
'timestamp': 'upload timestamp',
|
|
'title': 'upload title',
|
|
'duration': 'duration',
|
|
'screenshot': 'screenshot',
|
|
'hash': 'hash',
|
|
'wacz': 'wacz'
|
|
}
|
|
|
|
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
|
self.wks = worksheet
|
|
self.columns = columns
|
|
self.values = self.wks.get_values()
|
|
if len(self.values) > 0:
|
|
self.headers = [v.lower() for v in self.values[header_row - 1]]
|
|
else:
|
|
self.headers = []
|
|
|
|
def _check_col_exists(self, col: str):
|
|
if col not in self.columns:
|
|
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
|
|
|
def _col_index(self, col: str):
|
|
self._check_col_exists(col)
|
|
return self.headers.index(self.columns[col])
|
|
|
|
def col_exists(self, col: str):
|
|
self._check_col_exists(col)
|
|
return self.columns[col] in self.headers
|
|
|
|
def count_rows(self):
|
|
return len(self.values)
|
|
|
|
def get_row(self, row: int):
|
|
# row is 1-based
|
|
return self.values[row - 1]
|
|
|
|
def get_values(self):
|
|
return self.values
|
|
|
|
def get_cell(self, row, col: str, fresh=False):
|
|
"""
|
|
returns the cell value from (row, col),
|
|
where row can be an index (1-based) OR list of values
|
|
as received from self.get_row(row)
|
|
if fresh=True, the sheet is queried again for this cell
|
|
"""
|
|
col_index = self._col_index(col)
|
|
|
|
if fresh:
|
|
return self.wks.cell(row, col_index + 1).value
|
|
if type(row) == int:
|
|
row = self.get_row(row)
|
|
|
|
if col_index >= len(row):
|
|
return ''
|
|
return row[col_index]
|
|
|
|
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
|
"""
|
|
return self.get_cell or default value on error (eg: column is missing)
|
|
"""
|
|
try:
|
|
val = self.get_cell(row, col, fresh)
|
|
if when_empty_use_default and val.strip() == "":
|
|
return default
|
|
return val
|
|
except:
|
|
return default
|
|
|
|
def set_cell(self, row: int, col: str, val):
|
|
# row is 1-based
|
|
col_index = self._col_index(col) + 1
|
|
self.wks.update_cell(row, col_index, val)
|
|
|
|
def batch_set_cell(self, cell_updates):
|
|
"""
|
|
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
|
"""
|
|
cell_updates = [
|
|
{
|
|
'range': self.to_a1(row, col),
|
|
'values': [[val]]
|
|
}
|
|
for row, col, val in cell_updates
|
|
]
|
|
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
|
|
|
def to_a1(self, row: int, col: str):
|
|
# row is 1-based
|
|
return utils.rowcol_to_a1(row, self._col_index(col) + 1)
|