mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
pyproject
This commit is contained in:
6
src/auto_archiver/utils/__init__.py
Normal file
6
src/auto_archiver/utils/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# we need to explicitly expose the available imports here
|
||||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
from .util import Util
|
||||
from .webdriver import Webdriver
|
||||
from .gsheet import Gsheets
|
||||
44
src/auto_archiver/utils/gsheet.py
Normal file
44
src/auto_archiver/utils/gsheet.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import json, gspread
|
||||
|
||||
from loguru import logger
|
||||
from ..core import Step
|
||||
|
||||
|
||||
class Gsheets(Step):
|
||||
name = "gsheets"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
|
||||
assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||
"columns": {
|
||||
"default": {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
},
|
||||
}
|
||||
109
src/auto_archiver/utils/gworksheet.py
Normal file
109
src/auto_archiver/utils/gworksheet.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from gspread import utils
|
||||
|
||||
|
||||
class GWorksheet:
|
||||
"""
|
||||
This class makes read/write operations to the a worksheet easier.
|
||||
It can read the headers from a custom row number, but the row references
|
||||
should always include the offset of the header.
|
||||
eg: if header=4, row 5 will be the first with data.
|
||||
"""
|
||||
COLUMN_NAMES = {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
}
|
||||
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||
self.wks = worksheet
|
||||
self.columns = columns
|
||||
self.values = self.wks.get_values()
|
||||
if len(self.values) > 0:
|
||||
self.headers = [v.lower() for v in self.values[header_row - 1]]
|
||||
else:
|
||||
self.headers = []
|
||||
|
||||
def _check_col_exists(self, col: str):
|
||||
if col not in self.columns:
|
||||
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.headers.index(self.columns[col])
|
||||
|
||||
def col_exists(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.columns[col] in self.headers
|
||||
|
||||
def count_rows(self):
|
||||
return len(self.values)
|
||||
|
||||
def get_row(self, row: int):
|
||||
# row is 1-based
|
||||
return self.values[row - 1]
|
||||
|
||||
def get_values(self):
|
||||
return self.values
|
||||
|
||||
def get_cell(self, row, col: str, fresh=False):
|
||||
"""
|
||||
returns the cell value from (row, col),
|
||||
where row can be an index (1-based) OR list of values
|
||||
as received from self.get_row(row)
|
||||
if fresh=True, the sheet is queried again for this cell
|
||||
"""
|
||||
col_index = self._col_index(col)
|
||||
|
||||
if fresh:
|
||||
return self.wks.cell(row, col_index + 1).value
|
||||
if type(row) == int:
|
||||
row = self.get_row(row)
|
||||
|
||||
if col_index >= len(row):
|
||||
return ''
|
||||
return row[col_index]
|
||||
|
||||
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
||||
"""
|
||||
return self.get_cell or default value on error (eg: column is missing)
|
||||
"""
|
||||
try:
|
||||
val = self.get_cell(row, col, fresh)
|
||||
if when_empty_use_default and val.strip() == "":
|
||||
return default
|
||||
return val
|
||||
except:
|
||||
return default
|
||||
|
||||
def set_cell(self, row: int, col: str, val):
|
||||
# row is 1-based
|
||||
col_index = self._col_index(col) + 1
|
||||
self.wks.update_cell(row, col_index, val)
|
||||
|
||||
def batch_set_cell(self, cell_updates):
|
||||
"""
|
||||
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
||||
"""
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[val]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
]
|
||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||
|
||||
def to_a1(self, row: int, col: str):
|
||||
# row is 1-based
|
||||
return utils.rowcol_to_a1(row, self._col_index(col) + 1)
|
||||
42
src/auto_archiver/utils/misc.py
Normal file
42
src/auto_archiver/utils/misc.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
import os, json, requests
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
|
||||
|
||||
def expand_url(url):
|
||||
# expand short URL links
|
||||
if 'https://t.co/' in url:
|
||||
try:
|
||||
r = requests.get(url)
|
||||
logger.debug(f'Expanded url {url} to {r.url}')
|
||||
return r.url
|
||||
except:
|
||||
logger.error(f'Failed to expand url {url}')
|
||||
return url
|
||||
|
||||
|
||||
def getattr_or(o: object, prop: str, default=None):
|
||||
try:
|
||||
res = getattr(o, prop)
|
||||
if res is None: raise
|
||||
return res
|
||||
except:
|
||||
return default
|
||||
|
||||
|
||||
class DateTimeEncoder(json.JSONEncoder):
|
||||
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
|
||||
def default(self, o):
|
||||
if isinstance(o, datetime):
|
||||
return str(o) # with timezone
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
||||
|
||||
def dump_payload(p):
|
||||
return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
19
src/auto_archiver/utils/util.py
Normal file
19
src/auto_archiver/utils/util.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from ..core import Metadata, Step
|
||||
|
||||
#TODO: likely unused
|
||||
@dataclass
|
||||
class Util(Step):
|
||||
name = "util"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
Step.__init__(self)
|
||||
|
||||
# only for typing...
|
||||
def init(name: str, config: dict) -> Util:
|
||||
return super().init(name, config, Util)
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, item: Metadata) -> Metadata: pass
|
||||
45
src/auto_archiver/utils/webdriver.py
Normal file
45
src/auto_archiver/utils/webdriver.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from loguru import logger
|
||||
from selenium.webdriver.common.by import By
|
||||
import time
|
||||
|
||||
|
||||
class Webdriver:
|
||||
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False) -> webdriver:
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.facebook_accept_cookies = facebook_accept_cookies
|
||||
|
||||
def __enter__(self) -> webdriver:
|
||||
options = webdriver.FirefoxOptions()
|
||||
options.headless = True
|
||||
options.set_preference('network.protocol-handler.external.tg', False)
|
||||
try:
|
||||
self.driver = webdriver.Firefox(options=options)
|
||||
self.driver.set_window_size(self.width, self.height)
|
||||
self.driver.set_page_load_timeout(self.timeout_seconds)
|
||||
except TimeoutException as e:
|
||||
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
|
||||
|
||||
if self.facebook_accept_cookies:
|
||||
try:
|
||||
logger.debug(f'Trying fb click accept cookie popup.')
|
||||
self.driver.get("http://www.facebook.com")
|
||||
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
|
||||
foo.click()
|
||||
logger.debug(f'fb click worked')
|
||||
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
|
||||
time.sleep(2)
|
||||
except:
|
||||
logger.warning(f'Failed on fb accept cookies.')
|
||||
|
||||
return self.driver
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.driver.close()
|
||||
self.driver.quit()
|
||||
del self.driver
|
||||
return True
|
||||
Reference in New Issue
Block a user