mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
general security updates
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
@@ -60,7 +62,9 @@ class ArchivingOrchestrator:
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
for d in self.databases: d.failed(item)
|
||||
for d in self.databases:
|
||||
if type(e) == AssertionError: d.failed(item, str(e))
|
||||
else: d.failed(item)
|
||||
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
@@ -74,6 +78,7 @@ class ArchivingOrchestrator:
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
original_url = result.get_url()
|
||||
self.assert_valid_url(original_url)
|
||||
|
||||
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
|
||||
url = original_url
|
||||
@@ -128,3 +133,23 @@ class ArchivingOrchestrator:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
def assert_valid_url(self, url: str) -> bool:
    """
    Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.

    Rejections are raised explicitly rather than with `assert` statements,
    because `assert` is removed when Python runs with -O and that would
    silently disable this validation. Only IP-literal hostnames are
    inspected; domain names are not resolved here.

    :param url: the URL about to be archived
    :return: True when the URL passes every check
    :raises AssertionError: with a short reason when the URL is rejected
    """
    # non-string input is rejected with the same error instead of an AttributeError
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        raise AssertionError("Invalid URL scheme")

    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError as e:
        # urlparse refuses malformed netlocs, e.g. an unclosed IPv6 bracket
        raise AssertionError("Invalid URL") from e

    if parsed.scheme not in ("http", "https"):
        raise AssertionError("Invalid URL scheme")

    # "localhost." / "127.0.0.1." are fully-qualified spellings of the same
    # host; strip the trailing dot so they cannot slip past the checks below
    hostname = (hostname or "").rstrip(".")
    if not hostname:
        raise AssertionError("Invalid URL hostname")
    if hostname == "localhost":
        raise AssertionError("Invalid URL")

    try:  # special rules for IP addresses
        ip = ip_address(hostname)
    except ValueError:
        # hostname is not an IP literal, nothing more to check
        return True

    if not ip.is_global or ip.is_reserved or ip.is_link_local or ip.is_private:
        raise AssertionError("Invalid IP used")
    return True
|
||||
|
||||
@@ -21,8 +21,8 @@ class ConsoleDb(Database):
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
|
||||
def failed(self, item: Metadata) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
def failed(self, item: Metadata, reason: str = "") -> None:
    """
    Logs that archiving `item` failed.

    `reason` is optional: the orchestrator's error handler calls
    `failed(item)` without a reason for unexpected errors, and a required
    parameter would raise TypeError there.
    """
    if reason:
        logger.error(f"FAILED {item}: {reason}")
    else:
        logger.error(f"FAILED {item}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
|
||||
@@ -22,7 +22,7 @@ class Database(Step, ABC):
|
||||
"""signals the DB that the given item archival has started"""
|
||||
pass
|
||||
|
||||
def failed(self, item: Metadata) -> None:
|
||||
def failed(self, item: Metadata, reason: str = "") -> None:
    """
    update DB accordingly for failure

    `reason` is an optional short explanation of the failure; it defaults to
    an empty string because the orchestrator's error handler calls
    `failed(item)` without one for unexpected errors.
    """
    pass
|
||||
|
||||
|
||||
@@ -29,9 +29,9 @@ class GsheetsDb(Database):
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
|
||||
def failed(self, item: Metadata) -> None:
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, 'Archive failed')
|
||||
self._safe_status_update(item, f'Archive failed {reason}')
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
@@ -102,6 +102,11 @@ class GsheetsDb(Database):
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
|
||||
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
|
||||
row: int = ArchivingContext.get("gsheet").get("row")
|
||||
if gsheet := ArchivingContext.get("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
print(self.sheet_id)
|
||||
|
||||
|
||||
return gw, row
|
||||
|
||||
@@ -27,7 +27,10 @@ class SSLEnricher(Enricher):
|
||||
if not to_enrich.media and self.skip_when_nothing_archived: return
|
||||
|
||||
url = to_enrich.get_url()
|
||||
domain = urlparse(url).netloc
|
||||
parsed = urlparse(url)
|
||||
assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
|
||||
|
||||
domain = parsed.netloc
|
||||
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
|
||||
|
||||
cert = ssl.get_server_certificate((domain, 443))
|
||||
|
||||
@@ -21,7 +21,7 @@ class HtmlFormatter(Formatter):
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
|
||||
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
|
||||
# JinjaHelper class static methods are added as filters
|
||||
self.environment.filters.update({
|
||||
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "9"
|
||||
_MINOR = "10"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "11"
|
||||
_PATCH = "0"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user