mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
feat: WACZ enricher can now be probed for media, and used as an archiver OR enricher
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
import os, json, requests
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
@@ -21,14 +20,6 @@ def expand_url(url):
|
||||
logger.error(f'Failed to expand url {url}')
|
||||
return url
|
||||
|
||||
def remove_get_parameters(url):
    """Return ``url`` with its query string removed.

    Example: http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
    Stripping the query is useful for mimetypes to work.
    """
    # Split into the six URL components and rebuild with an empty query.
    scheme, netloc, path, params, _query, fragment = urlparse(url)
    return urlunparse((scheme, netloc, path, params, '', fragment))
|
||||
|
||||
|
||||
def getattr_or(o: object, prop: str, default=None):
|
||||
try:
|
||||
res = getattr(o, prop)
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
import re
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
|
||||
class UrlUtil:
|
||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||
|
||||
@staticmethod
|
||||
def clean(url): return url
|
||||
def clean(url: str) -> str:
    """Return ``url`` as-is; no cleaning is currently applied."""
    return url
|
||||
|
||||
@staticmethod
|
||||
def is_auth_wall(url):
|
||||
def is_auth_wall(url: str) -> bool:
|
||||
"""
|
||||
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
||||
"""
|
||||
@@ -17,3 +19,28 @@ class UrlUtil:
|
||||
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def remove_get_parameters(url: str) -> str:
    """Strip the query string from ``url`` and return the result.

    e.g. http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
    (useful for mimetypes to work)
    """
    # Blank out only the query component; every other part is kept.
    return urlparse(url)._replace(query='').geturl()
|
||||
|
||||
@staticmethod
|
||||
def is_relevant_url(url: str) -> bool:
    """
    Tell whether a detected media URL is relevant to a specific archive.

    Returns False for recurring assets (favicons, ``.ico`` icons, SVGs and
    twitter profile pictures) and True for everything else. Useful, for
    example, for the enumeration of the media files in WARC files which
    include profile pictures, favicons, etc.
    """
    # Inspect the path component only, so that neither a query string
    # (logo.svg?v=2) nor a fragment (logo.svg#top) hides the extension;
    # lower-cased so that .ICO / .SVG are recognised as well.
    path = urlparse(url).path.lower()

    # favicons
    if "favicon" in url:
        return False
    # ignore icons and SVGs
    if path.endswith((".ico", ".svg")):
        return False
    # twitter profile pictures
    if "twimg.com/profile_images" in url:
        return False

    return True
|
||||
|
||||
Reference in New Issue
Block a user