mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 04:38:29 +03:00
Clean up browsertrix-crawler files
Remove any local browsertrix-crawler files after the WACZ has been copied to storage. Note: until the fix for the following issue is released on DockerHub, the local files cannot be deleted, because Docker on Linux creates them as root: https://github.com/webrecorder/browsertrix-crawler/issues/170 The code catches the resulting PermissionError and logs a warning instead of failing and losing the work that has already been completed.
This commit is contained in:
@@ -231,10 +231,10 @@ class Archiver(ABC):
|
||||
cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
subprocess.run(cmd, check=True)
|
||||
except Exception as e:
|
||||
logger.error(f"wacz generation failed: {e}")
|
||||
logger.error(f"WACZ generation failed: {e}")
|
||||
return
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
@@ -242,8 +242,11 @@ class Archiver(ABC):
|
||||
self.storage.upload(filename, key, extra_args={
|
||||
'ACL': 'public-read', 'ContentType': 'application/zip'})
|
||||
|
||||
# TODO: remove wacz collection, waiting for resolution on:
|
||||
# https://github.com/webrecorder/browsertrix-crawler/issues/170
|
||||
# clean up the local browsertrix files
|
||||
try:
|
||||
shutil.rmtree(browsertrix_home)
|
||||
except PermissionError:
|
||||
logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
|
||||
|
||||
return self.storage.get_cdn_url(key)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user