cleanup auto_archive and config

This commit is contained in:
msramalho
2022-06-03 18:03:49 +02:00
parent aaa1d299da
commit 5135e97d3f
6 changed files with 75 additions and 127 deletions

View File

@@ -6,7 +6,6 @@ name = "pypi"
[packages] [packages]
gspread = "*" gspread = "*"
boto3 = "*" boto3 = "*"
python-dotenv = "*"
argparse = "*" argparse = "*"
beautifulsoup4 = "*" beautifulsoup4 = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}

8
Pipfile.lock generated
View File

@@ -707,14 +707,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2" "version": "==2.8.2"
}, },
"python-dotenv": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
"index": "pypi",
"version": "==0.20.0"
},
"requests": { "requests": {
"hashes": [ "hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",

View File

@@ -78,6 +78,6 @@ class WaybackArchiver(Archiver):
title = "Could not get title" title = "Could not get title"
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result self.seen_urls[url] = result
return result return result

View File

@@ -1,24 +1,12 @@
import sys
import datetime import datetime
import shutil import shutil
from loguru import logger
from dotenv import load_dotenv
import traceback import traceback
from loguru import logger
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
from utils import GWorksheet, mkdir_if_not_exists, expand_url from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config from configs import Config
from storages.gd_storage import GDConfig, GDStorage
from utils import GWorksheet, mkdir_if_not_exists
import sys
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")
def update_sheet(gw, row, result: ArchiveResult): def update_sheet(gw, row, result: ArchiveResult):
cell_updates = [] cell_updates = []
@@ -33,8 +21,7 @@ def update_sheet(gw, row, result: ArchiveResult):
batch_if_valid('archive', result.cdn_url) batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('thumbnail', result.thumbnail, batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index) batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title) batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration)) batch_if_valid('duration', result.duration, str(result.duration))
@@ -54,109 +41,82 @@ def update_sheet(gw, row, result: ArchiveResult):
gw.batch_set_cell(cell_updates) gw.batch_set_cell(cell_updates)
def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): def missing_required_columns(gw: GWorksheet):
sh = c.gsheets_client.open(sheet) required_found = True
for required_col in ['url', 'status']:
if not gw.col_exists(required_col):
logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.worksheet.title}')
required_found = False
return required_found
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check # loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()): for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}') logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=header, columns=columns) gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
if not gw.col_exists('url'): if missing_required_columns(gw): continue
logger.warning(
f'No "{c.column_names["url"]}" column found, skipping worksheet {wks.title}')
continue
if not gw.col_exists('status'):
logger.warning(
f'No "{c.column_names["status"]}" column found, skipping worksheet {wks.title}')
continue
# archives will be in a folder 'doc_name/worksheet_name' # archives will be in a folder 'doc_name/worksheet_name'
c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') # TODO: use slugify lib
c.set_folder(f'{c.sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
storage = c.get_storage() storage = c.get_storage()
# loop through rows in worksheet # loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1): for row in range(1 + c.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url') url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status') original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]: if url == '' or status not in ['', None]: continue
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url) # All checks done - archival process starts here
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
storage.update_properties(subfolder=gw.get_cell_or_default(row, 'subfolder'))
subfolder = gw.get_cell_or_default(row, 'subfolder') # make a new driver so each spreadsheet row is idempotent
c.recreate_webdriver()
# make a new driver so each spreadsheet row is idempotent # order matters, first to succeed excludes remaining
c.recreate_webdriver() active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TelegramArchiver(storage, c.webdriver),
TiktokArchiver(storage, c.webdriver),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TwitterArchiver(storage, c.webdriver),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
# order matters, first to succeed excludes remaining for archiver in active_archivers:
active_archivers = [ logger.debug(f'Trying {archiver=} on {row=}')
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TelegramArchiver(storage, c.webdriver),
TiktokArchiver(storage, c.webdriver),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TwitterArchiver(storage, c.webdriver),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
storage_client = None try:
if storage == "s3": result = archiver.download(url, check_if_exists=True)
storage_client = s3_client except KeyboardInterrupt:
elif storage == "gd": # catches keyboard interruptions to do a clean exit
storage_client = gd_client logger.warning(f"caught interrupt for {archiver=} on {row=}")
else: gw.set_cell(row, 'status', '')
raise ValueError(f'Cant get storage_client {storage_client}') c.destroy_webdriver()
storage_client.update_properties(subfolder=subfolder) exit()
for archiver in active_archivers: except Exception as e:
logger.debug(f'Trying {archiver} on row {row}') result = False
logger.error(f'Got unexpected error in row {row} with {archiver=} for {url=}: {e}\n{traceback.format_exc()}')
try:
result = archiver.download(url, check_if_exists=True)
except KeyboardInterrupt:
logger.warning("caught interrupt")
gw.set_cell(row, 'status', '')
driver.quit()
exit()
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result:
# IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}, url {url}')
if result.status in ['success', 'already archived']:
result.status = f"{archiver.name}: {result.status}"
logger.success(f'{archiver} succeeded on row {row}')
break
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = f"{archiver.name}: {result.status}"
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning(
f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
driver.quit()
if result: if result:
update_sheet(gw, row, result) result.status = f"{archiver.name}: {result.status}"
else: if result.status in ['success', 'already archived']:
gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'{archiver=} succeeded on {row=}, {url=}')
break
logger.warning(f'{archiver} did not succeed on {row=}, final status: {result.status}')
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
logger.success(f'Finshed worksheet {wks.title}') logger.success(f'Finshed worksheet {wks.title}')
@@ -164,26 +124,11 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
def main(): def main():
c = Config() c = Config()
c.parse() c.parse()
logger.info(f'Opening document {c.sheet} for header {c.header}') logger.info(f'Opening document {c.sheet} for header {c.header}')
parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to fill with {k} (defaults={v})"
if k == "subfolder":
help = f"the name of the column to read the {k} from (defaults={v})"
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
mkdir_if_not_exists(c.tmp_folder) mkdir_if_not_exists(c.tmp_folder)
process_sheet(c, c.sheet, header=c.header, columns=c.column_names) process_sheet(c)
shutil.rmtree(c.tmp_folder)
c.destroy_webdriver() c.destroy_webdriver()
shutil.rmtree(c.tmp_folder)
logger.info(f'Opening document {args.sheet} for header {args.header}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet, header=args.header, columns=config_columns)
shutil.rmtree('tmp')
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -37,6 +37,14 @@ class Config:
def __init__(self): def __init__(self):
self.parser = self.get_argument_parser() self.parser = self.get_argument_parser()
self.folder = "" self.folder = ""
self.set_log_files()
def set_log_files(self):
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")
def parse(self): def parse(self):
self.args = self.parser.parse_args() self.args = self.parser.parse_args()
@@ -145,7 +153,10 @@ class Config:
parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]') parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]')
for k, v in GWorksheet.COLUMN_NAMES.items(): for k, v in GWorksheet.COLUMN_NAMES.items():
parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to fill with {k} (default='{v}')") help = f"the name of the column to FILL WITH {k} (default='{v}')"
if k in ["url", "subfolder"]:
help = f"the name of the column to READ {k} FROM (default='{v}')"
parser.add_argument(f'--col-{k}', action='store', dest=k, help=help)
return parser return parser

View File

@@ -13,7 +13,8 @@ def expand_url(url):
if 'https://t.co/' in url: if 'https://t.co/' in url:
try: try:
r = requests.get(url) r = requests.get(url)
url = r.url logger.debug(f'Expanded url {url} to {r.url}')
return r.url
except: except:
logger.error(f'Failed to expand url {url}') logger.error(f'Failed to expand url {url}')
return url return url