mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Merge branch 'main' into small_issues
# Conflicts: # src/auto_archiver/core/base_module.py # src/auto_archiver/utils/misc.py
This commit is contained in:
@@ -61,11 +61,7 @@ def random_str(length: int = 32) -> str:
|
||||
return str(uuid.uuid4()).replace("-", "")[:length]
|
||||
|
||||
|
||||
def json_loader(cli_val):
    """Decode a JSON document passed as a string and return the resulting Python object."""
    decoded = json.loads(cli_val)
    return decoded
|
||||
|
||||
|
||||
def calculate_file_hash(filename: str, hash_algo=hashlib.sha256, chunksize: int = 16000000) -> str:
|
||||
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
|
||||
hash = hash_algo()
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import re
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from ipaddress import ip_address
|
||||
|
||||
|
||||
AUTHWALL_URLS = [
|
||||
@@ -7,6 +8,43 @@ AUTHWALL_URLS = [
|
||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
||||
]
|
||||
|
||||
|
||||
def check_url_or_raise(url: str) -> bool:
    """
    Validate that ``url`` is acceptable to fetch, raising if it is not.

    Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.

    Only the hostname text is inspected: literal IP addresses are classified with
    ``ipaddress``, but ordinary hostnames are NOT resolved, so a DNS name that points
    at an internal address is not detected here.

    :param url: the URL to validate.
    :return: ``True`` when every check passes.
    :raises ValueError: if the URL does not start with ``http://`` or ``https://``,
        has no hostname, names localhost (including ``*.localhost``), or is a literal
        IP address that is not globally reachable, or is reserved, link-local or private.
    """
    # str.startswith accepts a tuple, so one call covers both allowed prefixes.
    # Once this passes, urlparse() always reports the scheme as http or https,
    # so no second scheme check on the parsed result is needed.
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"Invalid URL scheme for url {url}")

    parsed = urlparse(url)

    # A trailing dot only marks the name as fully qualified ("localhost." is the
    # same host as "localhost", "127.0.0.1." the same as "127.0.0.1"). Strip it so
    # it cannot be used to slip past the localhost and IP checks below.
    hostname = (parsed.hostname or "").rstrip(".")
    if not hostname:
        raise ValueError(f"Invalid URL hostname for url {url}")

    # parsed.hostname is already lower-cased by urlparse. Names under the
    # ".localhost" TLD are reserved for loopback use, so treat them like localhost.
    if hostname == "localhost" or hostname.endswith(".localhost"):
        raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})")

    try:  # special rules for IP addresses
        ip = ip_address(hostname)
    except ValueError:
        # Not an IP literal: a regular hostname, nothing more to check here.
        return True

    if not ip.is_global:
        raise ValueError(f"IP address {ip} is not globally reachable")
    # The checks below overlap with is_global for most addresses but are kept as
    # explicit guards, since the categories are not identical for every range.
    if ip.is_reserved:
        raise ValueError(f"Reserved IP address {ip} used")
    if ip.is_link_local:
        raise ValueError(f"Link-local IP address {ip} used")
    if ip.is_private:
        raise ValueError(f"Private IP address {ip} used")

    return True
|
||||
|
||||
def domain_for_url(url: str) -> str:
|
||||
"""
|
||||
SECURITY: parse the domain using urllib to avoid any potential security issues
|
||||
|
||||
Reference in New Issue
Block a user