mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5324d562ba | ||
|
|
5bf0a0206d | ||
|
|
4941823565 | ||
|
|
27310c2911 | ||
|
|
eb973ba42d |
@@ -90,7 +90,9 @@ class ArchivingOrchestrator:
|
|||||||
if cached_result:
|
if cached_result:
|
||||||
logger.debug("Found previously archived entry")
|
logger.debug("Found previously archived entry")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.done(cached_result, cached=True)
|
try: d.done(cached_result, cached=True)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
return cached_result
|
return cached_result
|
||||||
|
|
||||||
# 3 - call archivers until one succeeds
|
# 3 - call archivers until one succeeds
|
||||||
@@ -120,6 +122,9 @@ class ArchivingOrchestrator:
|
|||||||
result.status = "nothing archived"
|
result.status = "nothing archived"
|
||||||
|
|
||||||
# signal completion to databases and archivers
|
# signal completion to databases and archivers
|
||||||
for d in self.databases: d.done(result)
|
for d in self.databases:
|
||||||
|
try: d.done(result)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -23,8 +23,7 @@ class AAApiDb(Database):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||||
"api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
|
"api_token": {"default": None, "help": "API Bearer token."},
|
||||||
"api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
|
|
||||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||||
@@ -59,7 +58,7 @@ class AAApiDb(Database):
|
|||||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||||
|
|
||||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||||
headers = {"Authorization": f"Bearer {self.api_secret}"}
|
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
|
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
|
|||||||
@@ -35,6 +35,23 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def setup(self) -> None:
|
||||||
|
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
||||||
|
self.browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST')
|
||||||
|
self.browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or self.browsertrix_home_host
|
||||||
|
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||||
|
if self.use_docker:
|
||||||
|
if self.browsertrix_home_container:
|
||||||
|
os.makedirs(self.browsertrix_home_container, exist_ok=True)
|
||||||
|
|
||||||
|
def cleanup(self) -> None:
|
||||||
|
if self.use_docker:
|
||||||
|
if self.browsertrix_home_container:
|
||||||
|
logger.debug(f"Removing {self.browsertrix_home_container=}")
|
||||||
|
shutil.rmtree(self.browsertrix_home_container, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
# this new Metadata object is required to avoid duplication
|
# this new Metadata object is required to avoid duplication
|
||||||
@@ -51,8 +68,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
|
|
||||||
collection = random_str(8)
|
collection = random_str(8)
|
||||||
browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
|
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||||
browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
|
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"crawl",
|
"crawl",
|
||||||
@@ -69,9 +86,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
"--timeout", str(self.timeout)]
|
"--timeout", str(self.timeout)]
|
||||||
|
|
||||||
# call docker if explicitly enabled or we are running on the host (not in docker)
|
# call docker if explicitly enabled or we are running on the host (not in docker)
|
||||||
use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
if self.use_docker:
|
||||||
|
|
||||||
if use_docker:
|
|
||||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||||
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
|
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
|
||||||
if self.docker_commands:
|
if self.docker_commands:
|
||||||
@@ -103,7 +118,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
logger.error(f"WACZ generation failed: {e}")
|
logger.error(f"WACZ generation failed: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if use_docker:
|
if self.use_docker:
|
||||||
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
||||||
else:
|
else:
|
||||||
wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
|
||||||
@@ -116,7 +131,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
if self.extract_media or self.extract_screenshot:
|
if self.extract_media or self.extract_screenshot:
|
||||||
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
||||||
|
|
||||||
if use_docker:
|
if self.use_docker:
|
||||||
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
||||||
else:
|
else:
|
||||||
jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")
|
||||||
|
|||||||
@@ -177,14 +177,23 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function run() {
|
async function run() {
|
||||||
await PreviewCertificates();
|
let setupFunctions = [
|
||||||
await PreviewText();
|
previewCertificates,
|
||||||
await enableCopyLogic();
|
previewText,
|
||||||
await enableCollapsibleLogic();
|
enableCopyLogic,
|
||||||
await setupSafeView();
|
enableCollapsibleLogic,
|
||||||
|
setupSafeView
|
||||||
|
];
|
||||||
|
setupFunctions.forEach(async f => {
|
||||||
|
try {
|
||||||
|
await f();
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`Error in ${f.name}: ${e}`);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function PreviewCertificates() {
|
async function previewCertificates() {
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
Array.from(document.querySelectorAll(".pem-certificate")).map(async el => {
|
Array.from(document.querySelectorAll(".pem-certificate")).map(async el => {
|
||||||
let certificate = await (await fetch(el.getAttribute("pem"))).text();
|
let certificate = await (await fetch(el.getAttribute("pem"))).text();
|
||||||
@@ -202,7 +211,7 @@
|
|||||||
console.log("certificate preview done");
|
console.log("certificate preview done");
|
||||||
}
|
}
|
||||||
|
|
||||||
async function PreviewText() {
|
async function previewText() {
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
Array.from(document.querySelectorAll(".text-preview")).map(async el => {
|
Array.from(document.querySelectorAll(".text-preview")).map(async el => {
|
||||||
let textContent = await (await fetch(el.getAttribute("url"))).text();
|
let textContent = await (await fetch(el.getAttribute("url"))).text();
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "9"
|
_MINOR = "9"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "0"
|
_PATCH = "4"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user