From 20ca50dc90cbde60055d2ed3e7c643b5bc19c9af Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Tue, 11 Oct 2022 16:49:19 -0400 Subject: [PATCH] Clean up browsertrix-crawler files Remove any local browsertrix-crawler files after the WACZ has been copied to storage. Note: until the fix for this issue is released on DockerHub, the local files cannot be deleted, because Docker on Linux creates them as root: https://github.com/webrecorder/browsertrix-crawler/issues/170 The code catches the resulting PermissionError and logs a warning instead of failing and losing the work that has been completed. --- archivers/base_archiver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 4ee3433..ea172f8 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -231,10 +231,10 @@ class Archiver(ABC): cmd.extend(["--profile", "/crawls/profile.tar.gz"]) try: - logger.info(f"running browsertrix-crawler: {' '.join(cmd)}") + logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") subprocess.run(cmd, check=True) except Exception as e: - logger.error(f"wacz generation failed: {e}") + logger.error(f"WACZ generation failed: {e}") return filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") @@ -242,8 +242,11 @@ class Archiver(ABC): self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'application/zip'}) - # TODO: remove wacz collection, waiting for resolution on: - # https://github.com/webrecorder/browsertrix-crawler/issues/170 + # clean up the local browsertrix files + try: + shutil.rmtree(browsertrix_home) + except PermissionError: + logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}") return self.storage.get_cdn_url(key)