cleanup + rearchivable logic

This commit is contained in:
msramalho
2023-01-26 23:01:34 +00:00
parent 9dd8afed8c
commit 2508bb8a1b
8 changed files with 70 additions and 61 deletions

View File

@@ -23,10 +23,15 @@ class Archiver(Step):
# used when archivers need to login or do other one-time setup
pass
def clean_url(self, url: str) -> str:
# used to clean unnecessary URL parameters
def sanitize_url(self, url: str) -> str:
    """
    Hook for cleaning unnecessary URL parameters or unfurling redirect links.

    The default implementation hands the URL back untouched.
    """
    sanitized = url
    return sanitized
def is_rearchivable(self, url: str) -> bool:
    """
    Signal whether it makes sense to archive this piece of content again.

    Rearchiving is the default, so this always answers True.
    """
    rearchive_by_default = True
    return rearchive_by_default
def _guess_file_type(self, path: str) -> str:
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
@@ -57,19 +62,3 @@ class Archiver(Step):
@abstractmethod
def download(self, item: Metadata) -> Metadata:
    """Abstract hook: declared with @abstractmethod, takes a Metadata and is annotated to return one."""
# TODO: how to fix allow predictable key
# def get_key(self, filename):
# """
# returns a key in the format "[archiverName]_[filename]" includes extension
# """
# tail = os.path.split(filename)[1] # returns filename.ext from full path
# _id, extension = os.path.splitext(tail) # returns [filename, .ext]
# if 'unknown_video' in _id:
# _id = _id.replace('unknown_video', 'jpg')
# # long filenames can cause problems, so trim them if necessary
# if len(_id) > 128:
# _id = _id[-128:]
# return f'{self.name}_{_id}{extension}'

View File

@@ -22,6 +22,10 @@ class TelegramArchiver(Archiver):
def configs() -> dict:
    """Return the configs mapping for this archiver, which is empty."""
    return dict()
def is_rearchivable(self, url: str) -> bool:
    """Telegram posts are static, so never ask for them to be rearchived."""
    posts_are_static = True
    return not posts_are_static
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
# detect URLs that we definitely cannot handle

View File

@@ -39,6 +39,10 @@ class TelethonArchiver(Archiver):
}
}
def is_rearchivable(self, url: str) -> bool:
    """Telegram posts are static, so never ask for them to be rearchived."""
    posts_are_static = True
    return not posts_are_static
def setup(self) -> None:
"""
1. trigger login process for telegram or proceed if already saved in a session file

View File

@@ -19,6 +19,10 @@ class TiktokArchiver(Archiver):
@staticmethod
def configs() -> dict:
return {}
def is_rearchivable(self, url: str) -> bool:
    """TikTok posts are static, so never ask for them to be rearchived."""
    posts_are_static = True
    return not posts_are_static
def download(self, item: Metadata) -> Metadata:
url = item.get_url()

View File

@@ -11,6 +11,7 @@ from . import Archiver
from ..core import Metadata
from ..core import Media
class TwitterArchiver(Archiver):
"""
This Twitter Archiver uses unofficial scraping methods.
@@ -18,6 +19,7 @@ class TwitterArchiver(Archiver):
# name of this archiver
name = "twitter_archiver"
# captures (username, tweet id) from tweet URLs, also accepting the '#!/' and 'statuses' forms;
# the dot in 'twitter.com' is escaped so it only matches a literal '.', not any character
link_pattern = re.compile(r"twitter\.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
# group 1 keeps the URL up to the trailing numeric id; an optional '?' and everything after it is dropped
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
def __init__(self, config: dict) -> None:
super().__init__(config)
@@ -26,6 +28,22 @@ class TwitterArchiver(Archiver):
def configs() -> dict:
    """Return the configs mapping for this archiver, which is empty."""
    return dict()
def sanitize_url(self, url: str) -> str:
    """
    Expand t.co short links and strip tracker GET params from tweet URLs.

    When the URL contains 'https://t.co/' it is fetched with `requests` and replaced by the
    response's final URL. If that request fails, the error is logged and the URL is kept as is.
    The result is then run through `link_clean_pattern`, which keeps everything up to the
    numeric id, e.g.
    https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
    becomes https://twitter.com/MeCookieMonster/status/1617921633456640001
    URLs that do not match the pattern come back unchanged.
    """
    if 'https://t.co/' in url:
        try:
            # bounded wait so an unresponsive server cannot block archiving indefinitely
            r = requests.get(url, timeout=30)
        except Exception:
            # best-effort expansion: keep the short URL on failure, but unlike a bare
            # `except:` this lets KeyboardInterrupt/SystemExit propagate
            logger.error(f'Failed to expand url {url}')
        else:
            logger.debug(f'Expanded url {url} to {r.url}')
            url = r.url
    return self.link_clean_pattern.sub("\\1", url)
def is_rearchivable(self, url: str) -> bool:
    """Twitter posts are static, so never ask for them to be rearchived."""
    posts_are_static = True
    return not posts_are_static
def download(self, item: Metadata) -> Metadata:
"""
if this url is archivable will download post info and look for other posts from the same group with media.

View File

@@ -28,6 +28,10 @@ class VkArchiver(Archiver):
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
}
def is_rearchivable(self, url: str) -> bool:
    """VK content is static, so never ask for it to be rearchived."""
    content_is_static = True
    return not content_is_static
def download(self, item: Metadata) -> Metadata:
url = item.get_url()