Files
auto-archiver/src/auto_archiver/archivers/archiver.py
2025-01-21 16:43:14 +01:00

104 lines
3.6 KiB
Python

""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
This class provides common utility methods and a standard interface for archivers.
Factory method to initialize an archiver instance based on its name.
"""
from __future__ import annotations
from pathlib import Path
from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes, requests
from loguru import logger
from retrying import retry
from ..core import Metadata, Step, ArchivingContext
@dataclass
class Archiver(Step):
"""
Base class for implementing archivers in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
name = "archiver"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Archiver:
# only for typing...
return Step.init(name, config, Archiver)
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
pass
def cleanup(self) -> None:
# called when archivers are done, or upon errors, cleanup any resources
pass
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url
def suitable(self, url: str) -> bool:
"""
Returns True if this archiver can handle the given URL
Should be overridden by subclasses
"""
return True
def _guess_file_type(self, path: str) -> str:
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
"""
mime = mimetypes.guess_type(path)[0]
if mime is not None:
return mime.split("/")[0]
return ""
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
"""
downloads a URL to provided filename, or inferred from URL, returns local filename
"""
if not to_filename:
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
try:
d = requests.get(url, stream=True, headers=headers, timeout=30)
d.raise_for_status()
# get mimetype from the response headers
if not Path(to_filename).suffix:
content_type = d.headers.get('Content-Type')
extension = mimetypes.guess_extension(content_type)
if extension:
to_filename += extension
with open(to_filename, 'wb') as f:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod
def download(self, item: Metadata) -> Metadata:
pass