# auto-archiver/src/auto_archiver/core/extractor.py

""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
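    The `Extractor` class defined here provides common utility methods and a standard interface for extractors.

    NOTE(review): the factory method for initializing an extractor instance by name, previously
    described here, is not defined in this module - confirm where it lives or drop this note.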


"""
from __future__ import annotations
from pathlib import Path
from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes
import requests
from loguru import logger
from retrying import retry
import re

from auto_archiver.core import Metadata, BaseModule


class Extractor(BaseModule):
    """
    Base class for implementing extractors in the media archiving framework.

    Subclasses must implement the `download` method to define platform-specific behavior.
    They may also set `valid_url` to restrict the URLs they accept, and override
    `cleanup`, `sanitize_url` or `suitable` when the defaults below are not appropriate.
    """

    # Optional compiled pattern for the URLs this extractor supports.
    # When left unset (None), `suitable` accepts every URL.
    valid_url: re.Pattern | None = None

    def cleanup(self) -> None:
        """
        Called when extractors are done, or upon errors, to clean up any resources.

        The base implementation does nothing.
        """
        pass

    def sanitize_url(self, url: str) -> str:
        """
        Used to clean unnecessary URL parameters OR unfurl redirect links.

        The base implementation returns the URL unchanged.
        """
        return url

    def match_link(self, url: str) -> re.Match | None:
        """
        Returns a match object if the given URL matches the `valid_url` pattern, or None if it
        does not match or if no `valid_url` pattern is set.

        Normally used in the `suitable` method to check if the URL is supported by this extractor.
        """
        # Without a pattern there is nothing to match against; calling `.match` on
        # the default None would raise AttributeError.
        if not self.valid_url:
            return None
        return self.valid_url.match(url)

    def suitable(self, url: str) -> bool:
        """
        Returns True if this extractor can handle the given URL.

        With a `valid_url` pattern set, the URL must match it (via `match_link`);
        without one, every URL is considered suitable.

        Should be overridden by subclasses that need different rules.
        """
        if self.valid_url:
            return self.match_link(url) is not None

        return True

    def _guess_file_type(self, path: str) -> str:
        """
        Receives a URL or filename and returns global mimetype like 'image' or 'video'
        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types

        Returns an empty string when the type cannot be guessed.
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            # keep only the top-level type, e.g. "image/png" -> "image"
            return mime.split("/")[0]
        return ""

    # NOTE: `requests.RequestException` is caught inside the method, so the retry
    # decorator only sees other exceptions (e.g. errors raised while writing the file).
    @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
    def download_from_url(self, url: str, to_filename: str | None = None, verbose=True) -> str | None:
        """
        Downloads a URL to the provided filename, or to one inferred from the URL.

        The file is written inside `self.tmp_dir` (not defined in this class; presumably
        provided by `BaseModule` - confirm). If the target filename has no recognisable
        extension, one is appended based on the response `Content-Type` header, falling
        back to the extension of the URL path.

        :param url: the URL to fetch
        :param to_filename: optional filename to save to; when omitted, the last path segment
            of the URL (without query string, truncated to its last 64 characters) is used
        :param verbose: when True, logs a debug line before downloading
        :return: the local filename on success, or None if the request failed
        """
        if not to_filename:
            to_filename = url.split('/')[-1].split('?')[0]
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
        to_filename = os.path.join(self.tmp_dir, to_filename)
        if verbose:
            logger.debug(f"downloading {url[0:50]=} {to_filename=}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        try:
            response = requests.get(url, stream=True, headers=headers, timeout=30)
            try:
                response.raise_for_status()

                # the filename has no known extension: derive one from the response
                if not mimetypes.guess_type(to_filename)[0]:
                    # Drop header parameters such as "; charset=utf-8", which
                    # `mimetypes.guess_extension` does not understand.
                    header_value = response.headers.get('Content-Type') or ''
                    content_type = header_value.split(';')[0].strip().lower()
                    if not content_type:
                        # Fall back to the full MIME type guessed from the URL path;
                        # the query string is removed so it cannot hide the extension.
                        content_type = mimetypes.guess_type(url.split('?')[0])[0] or ''
                    extension = mimetypes.guess_extension(content_type) if content_type else None
                    if extension:
                        to_filename += extension

                with open(to_filename, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                return to_filename
            finally:
                # A streamed response keeps its connection open until it is fully
                # consumed or explicitly closed.
                response.close()

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch the Media URL: {e}")
            return None

    @abstractmethod
    def download(self, item: Metadata) -> Metadata | bool:
        """
        Downloads the media from the given URL and returns a Metadata object with the downloaded media.

        If the URL is not supported or the download fails, this method should return False.

        """
        pass