From 5135e97d3fdde49c6ef768e200aa8a1349f1285b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:03:49 +0200 Subject: [PATCH] cleanup auto_archive and config --- Pipfile | 1 - Pipfile.lock | 8 -- archivers/wayback_archiver.py | 2 +- auto_archive.py | 175 ++++++++++++---------------------- configs/config.py | 13 ++- utils/misc.py | 3 +- 6 files changed, 75 insertions(+), 127 deletions(-) diff --git a/Pipfile b/Pipfile index 7e5cbd7..7e31a3c 100644 --- a/Pipfile +++ b/Pipfile @@ -6,7 +6,6 @@ name = "pypi" [packages] gspread = "*" boto3 = "*" -python-dotenv = "*" argparse = "*" beautifulsoup4 = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} diff --git a/Pipfile.lock b/Pipfile.lock index e47d720..9ee753d 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -707,14 +707,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, - "python-dotenv": { - "hashes": [ - "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", - "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" - ], - "index": "pypi", - "version": "==0.20.0" - }, "requests": { "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index bb18d2b..c76437e 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -78,6 +78,6 @@ class WaybackArchiver(Archiver): title = "Could not get title" screenshot = self.get_screenshot(url) - result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) + result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot) self.seen_urls[url] = result return result diff --git a/auto_archive.py b/auto_archive.py index 5df2cd0..a0f8883 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,24 +1,12 @@ -import sys import datetime import shutil -from loguru import logger -from dotenv import load_dotenv - import traceback +from loguru import logger from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config -from storages.gd_storage import GDConfig, GDStorage -from utils import GWorksheet, mkdir_if_not_exists -import sys - -logger.add("logs/1trace.log", level="TRACE") -logger.add("logs/2info.log", level="INFO") -logger.add("logs/3success.log", level="SUCCESS") -logger.add("logs/4warning.log", level="WARNING") -logger.add("logs/5error.log", level="ERROR") def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] @@ -33,8 +21,7 @@ def update_sheet(gw, row, result: ArchiveResult): batch_if_valid('archive', result.cdn_url) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) - batch_if_valid('thumbnail', result.thumbnail, - f'=IMAGE("{result.thumbnail}")') + batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') batch_if_valid('thumbnail_index', result.thumbnail_index) batch_if_valid('title', result.title) batch_if_valid('duration', result.duration, str(result.duration)) @@ -54,109 +41,82 @@ def update_sheet(gw, row, result: ArchiveResult): gw.batch_set_cell(cell_updates) -def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): - sh = c.gsheets_client.open(sheet) +def missing_required_columns(gw: GWorksheet): + required_found = True + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.worksheet.title}') + required_found = False + return required_found + + +def process_sheet(c: Config): + sh = c.gsheets_client.open(c.sheet) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}') - gw = GWorksheet(wks, header_row=header, columns=columns) + logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') + gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) - if not gw.col_exists('url'): - logger.warning( - f'No "{c.column_names["url"]}" column found, skipping worksheet {wks.title}') - continue - - if not gw.col_exists('status'): - logger.warning( - f'No "{c.column_names["status"]}" column found, skipping worksheet {wks.title}') - continue + if missing_required_columns(gw): continue # archives will be in a folder 'doc_name/worksheet_name' - c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') + # TODO: use slugify lib + c.set_folder(f'{c.sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') storage = c.get_storage() # loop through rows in worksheet - for row in range(1 + header, gw.count_rows() + 1): + for row in range(1 + c.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url') original_status = gw.get_cell(row, 'status') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') - if url != '' and status in ['', None]: - gw.set_cell(row, 'status', 'Archive in progress') + if url == '' or status not in ['', None]: continue - url = expand_url(url) + # All checks done - archival process starts here + gw.set_cell(row, 'status', 'Archive in progress') + url = expand_url(url) + storage.update_properties(subfolder=gw.get_cell_or_default(row, 'subfolder')) - subfolder = gw.get_cell_or_default(row, 'subfolder') + # make a new driver so each spreadsheet row is idempotent + c.recreate_webdriver() - # make a new driver so each spreadsheet row is idempotent - c.recreate_webdriver() + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelegramArchiver(storage, c.webdriver), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver, c.wayback_config) + ] - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), - TiktokArchiver(storage, c.webdriver), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver, c.wayback_config) - ] + for archiver in active_archivers: + logger.debug(f'Trying {archiver=} on {row=}') - storage_client = None - if storage == "s3": - storage_client = s3_client - elif storage == "gd": - storage_client = gd_client - else: - raise ValueError(f'Cant get storage_client {storage_client}') - storage_client.update_properties(subfolder=subfolder) - for archiver in active_archivers: - logger.debug(f'Trying {archiver} on row {row}') + try: + result = archiver.download(url, check_if_exists=True) + except KeyboardInterrupt: + # catches keyboard interruptions to do a clean exit + logger.warning(f"caught interrupt for {archiver=} on {row=}") + gw.set_cell(row, 'status', '') + c.destroy_webdriver() + exit() + except Exception as e: + result = False + logger.error(f'Got unexpected error in row {row} with {archiver=} for {url=}: {e}\n{traceback.format_exc()}') - try: - result = archiver.download(url, check_if_exists=True) - except KeyboardInterrupt: - logger.warning("caught interrupt") - gw.set_cell(row, 'status', '') - driver.quit() - exit() - except Exception as e: - result = False - logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}') - - if result: - # IA is a Success I believe - or do we want to display a logger warning for it? - if result.status in ['success', 'already archived', 'Internet Archive fallback']: - result.status = archiver.name + \ - ": " + str(result.status) - logger.success( - f'{archiver} succeeded on row {row}, url {url}') - if result.status in ['success', 'already archived']: - result.status = f"{archiver.name}: {result.status}" - logger.success(f'{archiver} succeeded on row {row}') - break - logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') - result.status = f"{archiver.name}: {result.status}" - - - # wayback has seen this url before so keep existing status - if "wayback: Internet Archive fallback" in result.status: - logger.success( - f'wayback has seen this url before so keep existing status on row {row}') - result.status = result.status.replace(' (duplicate)', '') - result.status = str(result.status) + " (duplicate)" - break - - logger.warning( - f'{archiver} did not succeed on {row=}, final status: {result.status}') - result.status = archiver.name + \ - ": " + str(result.status) - # get rid of driver so can reload on next row - driver.quit() if result: - update_sheet(gw, row, result) - else: - gw.set_cell(row, 'status', 'failed: no archiver') + result.status = f"{archiver.name}: {result.status}" + if result.status in ['success', 'already archived']: + logger.success(f'{archiver=} succeeded on {row=}, {url=}') + break + logger.warning(f'{archiver} did not succeed on {row=}, final status: {result.status}') + + if result: + update_sheet(gw, row, result) + else: + gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') @@ -164,26 +124,11 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): def main(): c = Config() c.parse() - logger.info(f'Opening document {c.sheet} for header {c.header}') - parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"}) - - for k, v in GWorksheet.COLUMN_NAMES.items(): - help = f"the name of the column to fill with {k} (defaults={v})" - if k == "subfolder": - help = f"the name of the column to read the {k} from (defaults={v})" - parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help) - mkdir_if_not_exists(c.tmp_folder) - process_sheet(c, c.sheet, header=c.header, columns=c.column_names) - shutil.rmtree(c.tmp_folder) + process_sheet(c) c.destroy_webdriver() - - logger.info(f'Opening document {args.sheet} for header {args.header}') - - mkdir_if_not_exists('tmp') - process_sheet(args.sheet, header=args.header, columns=config_columns) - shutil.rmtree('tmp') + shutil.rmtree(c.tmp_folder) if __name__ == '__main__': diff --git a/configs/config.py b/configs/config.py index 11ed808..b517f4a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -37,6 +37,14 @@ class Config: def __init__(self): self.parser = self.get_argument_parser() self.folder = "" + self.set_log_files() + + def set_log_files(self): + logger.add("logs/1trace.log", level="TRACE") + logger.add("logs/2info.log", level="INFO") + logger.add("logs/3success.log", level="SUCCESS") + logger.add("logs/4warning.log", level="WARNING") + logger.add("logs/5error.log", level="ERROR") def parse(self): self.args = self.parser.parse_args() @@ -145,7 +153,10 @@ class Config: parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]') for k, v in GWorksheet.COLUMN_NAMES.items(): - parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to fill with {k} (default='{v}')") + help = f"the name of the column to FILL WITH {k} (default='{v}')" + if k in ["url", "subfolder"]: + help = f"the name of the column to READ {k} FROM (default='{v}')" + parser.add_argument(f'--col-{k}', action='store', dest=k, help=help) return parser diff --git a/utils/misc.py b/utils/misc.py index 5b1a688..2dfd683 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -13,7 +13,8 @@ def expand_url(url): if 'https://t.co/' in url: try: r = requests.get(url) - url = r.url + logger.debug(f'Expanded url {url} to {r.url}') + return r.url except: logger.error(f'Failed to expand url {url}') return url