mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
cleanup + rearchivable logic
This commit is contained in:
@@ -23,10 +23,15 @@ class Archiver(Step):
|
||||
# used when archivers need to login or do other one-time setup
|
||||
pass
|
||||
|
||||
def clean_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters
|
||||
def sanitize_url(self, url: str) -> str:
    """
    Hook used to clean unnecessary URL parameters or unfurl redirect links.

    The default implementation performs no cleaning and hands the URL back
    exactly as received.
    """
    return url
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """
    Tell whether it makes sense to archive this URL again.

    Archivers can override this to signal that rearchiving a piece of
    content is pointless; the default answer is to rearchive.
    """
    return True
|
||||
|
||||
def _guess_file_type(self, path: str) -> str:
|
||||
"""
|
||||
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
||||
@@ -57,19 +62,3 @@ class Archiver(Step):
|
||||
|
||||
@abstractmethod
def download(self, item: Metadata) -> Metadata:
    """Abstract download step; every concrete archiver must provide its own implementation."""
|
||||
|
||||
# TODO: work out how to allow a predictable key
|
||||
# def get_key(self, filename):
|
||||
# """
|
||||
# returns a key in the format "[archiverName]_[filename]" includes extension
|
||||
# """
|
||||
# tail = os.path.split(filename)[1] # returns filename.ext from full path
|
||||
# _id, extension = os.path.splitext(tail) # returns [filename, .ext]
|
||||
# if 'unknown_video' in _id:
|
||||
# _id = _id.replace('unknown_video', 'jpg')
|
||||
|
||||
# # long filenames can cause problems, so trim them if necessary
|
||||
# if len(_id) > 128:
|
||||
# _id = _id[-128:]
|
||||
|
||||
# return f'{self.name}_{_id}{extension}'
|
||||
@@ -22,6 +22,10 @@ class TelegramArchiver(Archiver):
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """Telegram posts are static, so archiving them again is never needed."""
    return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
|
||||
@@ -39,6 +39,10 @@ class TelethonArchiver(Archiver):
|
||||
}
|
||||
}
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """Report that rearchiving is pointless: telegram posts are static."""
    return False
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
1. trigger login process for telegram or proceed if already saved in a session file
|
||||
|
||||
@@ -19,6 +19,10 @@ class TiktokArchiver(Archiver):
|
||||
@staticmethod
def configs() -> dict:
    """Return this archiver's configuration options; there are none, so the dict is empty."""
    return {}
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """TikTok posts are static, so there is no point in archiving them again."""
    return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -11,6 +11,7 @@ from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
|
||||
|
||||
class TwitterArchiver(Archiver):
|
||||
"""
|
||||
This Twitter Archiver uses unofficial scraping methods.
|
||||
@@ -18,6 +19,7 @@ class TwitterArchiver(Archiver):
|
||||
|
||||
name = "twitter_archiver"
|
||||
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
@@ -26,6 +28,22 @@ class TwitterArchiver(Archiver):
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
    """
    Expand t.co short links and strip tracker GET parameters from tweet URLs.

    If the URL contains 'https://t.co/', it is fetched and replaced by the
    final URL after redirects. If the request fails, the error is logged and
    the unexpanded URL is kept. The result is then passed through
    `link_clean_pattern`, so that for example
    https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
    becomes
    https://twitter.com/MeCookieMonster/status/1617921633456640001
    """
    if 'https://t.co/' in url:
        try:
            # a timeout keeps an unresponsive shortener from hanging the archiver forever
            r = requests.get(url, timeout=30)
        except Exception as e:
            # best-effort expansion: keep the original URL, but unlike a bare
            # `except:` let KeyboardInterrupt/SystemExit propagate
            logger.error(f'Failed to expand url {url}: {e}')
        else:
            logger.debug(f'Expanded url {url} to {r.url}')
            url = r.url
    return self.link_clean_pattern.sub("\\1", url)
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """Twitter posts are static, so a second archiving pass is never needed."""
    return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
"""
|
||||
if this url is archivable will download post info and look for other posts from the same group with media.
|
||||
|
||||
@@ -28,6 +28,10 @@ class VkArchiver(Archiver):
|
||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||
}
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
    """VK content is static, so it does not make sense to archive it again."""
    return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user