"""The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.

This class provides common utility methods and a standard interface for extractors.

Factory method to initialize an extractor instance based on its name.
"""

from __future__ import annotations

from abc import abstractmethod
import mimetypes
import os

import requests
from loguru import logger
from retrying import retry
import re

from auto_archiver.core import Metadata, BaseModule


class Extractor(BaseModule):
    """
    Base class for implementing extractors in the media archiving framework.

    Subclasses must implement the `download` method to define platform-specific behavior.
    They may also set `valid_url` to a compiled pattern so that `suitable` only accepts
    matching URLs.
    """

    # Optional compiled pattern; when left as None, `suitable` accepts every URL.
    valid_url: re.Pattern = None

    def cleanup(self) -> None:
        """
        Called when extractors are done, or upon errors, cleanup any resources.

        The base implementation does nothing.
        """
        pass

    def sanitize_url(self, url: str) -> str:
        """
        Used to clean unnecessary URL parameters OR unfurl redirect links.

        The base implementation returns `url` unchanged.
        """
        return url

    def match_link(self, url: str) -> re.Match | None:
        """
        Returns a match object if the given URL matches the `valid_url` pattern, or None if not.

        Also returns None when no `valid_url` pattern is configured, instead of raising
        AttributeError on the missing pattern.
        Normally used in the `suitable` method to check if the URL is supported by this extractor.
        """
        if self.valid_url is None:
            return None
        return self.valid_url.match(url)

    def suitable(self, url: str) -> bool:
        """
        Returns True if this extractor can handle the given URL.

        With no `valid_url` pattern configured every URL is considered suitable.
        Should be overridden by subclasses.
        """
        if self.valid_url:
            return self.match_link(url) is not None
        return True

    def _guess_file_type(self, path: str) -> str:
        """
        Receives a URL or filename and returns global mimetype like 'image' or 'video'.

        Returns an empty string when the type cannot be guessed.
        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            return mime.split("/")[0]
        return ""

    @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
    def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str | None:
        """
        Downloads a URL to the provided filename, or one inferred from the URL.

        When no filename is given, the last path segment of the URL (query string removed,
        trimmed to its last 64 characters) is used. The file is written inside `self.tmp_dir`.
        If the filename has no recognisable extension, one is derived from the response's
        Content-Type header, falling back to the MIME type guessed from the URL.

        Returns the local filename on success, or None when the request fails
        (`requests.RequestException` is logged as a warning and swallowed, so it does not
        trigger the retry decorator; any other exception propagates to it).
        """
        if not to_filename:
            to_filename = url.split("/")[-1].split("?")[0]
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
        to_filename = os.path.join(self.tmp_dir, to_filename)
        if verbose:
            logger.debug(f"downloading {url[0:50]=} {to_filename=}")
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
        }
        try:
            # The context manager releases the streamed connection even if writing fails.
            with requests.get(url, stream=True, headers=headers, timeout=30) as response:
                response.raise_for_status()

                # get mimetype from the response headers
                if not mimetypes.guess_type(to_filename)[0]:
                    # Drop parameters such as "; charset=UTF-8": guess_extension only
                    # recognises a bare "type/subtype" value.
                    header_value = response.headers.get("Content-Type") or ""
                    content_type = header_value.split(";")[0].strip().lower()
                    if not content_type:
                        content_type = mimetypes.guess_type(url)[0]
                    extension = mimetypes.guess_extension(content_type) if content_type else None
                    if extension:
                        to_filename += extension

                with open(to_filename, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            return to_filename

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch the Media URL: {e}")
            return None

    @abstractmethod
    def download(self, item: Metadata) -> Metadata | False:
        """
        Downloads the media from the given URL and returns a Metadata object with the downloaded media.

        If the URL is not supported or the download fails, this method should return False.
        """
        pass