Refactoring for new config setup

2026-06-13 05:38:29 +03:00 · 2025-01-27 19:03:02 +00:00
parent e3074013d0
commit e1a9373336
52 changed files with 219 additions and 242 deletions
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -0,0 +1,89 @@
+""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
+    Its `Extractor` base class provides common utility methods and a standard interface for extractors.
+
+    Factory method to initialize an extractor instance based on its name.
+
+
+"""
+from __future__ import annotations
+from pathlib import Path
+from abc import abstractmethod
+from dataclasses import dataclass
+import mimetypes
+import os
+import mimetypes, requests
+from loguru import logger
+from retrying import retry
+
+from ..core import Metadata, ArchivingContext, BaseModule
+
+
+@dataclass
+class Extractor(BaseModule):
+    """
+    Base class for implementing extractors in the media archiving framework.
+    Subclasses must implement the `download` method to define platform-specific behavior.
+    """
+
+    def cleanup(self) -> None:
+        # called when extractors are done, or upon errors, cleanup any resources
+        pass
+
+    def sanitize_url(self, url: str) -> str:
+        # used to clean unnecessary URL parameters OR unfurl redirect links
+        return url
+
+    def suitable(self, url: str) -> bool:
+        """
+        Returns True if this extractor can handle the given URL
+
+        Should be overridden by subclasses
+        """
+        return True
+
+    def _guess_file_type(self, path: str) -> str:
+        """
+        Receives a URL or filename and returns global mimetype like 'image' or 'video'
+        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
+        """
+        mime = mimetypes.guess_type(path)[0]
+        if mime is not None:
+            return mime.split("/")[0]
+        return ""
+
+    @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
+    def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
+        """
+            downloads a URL to provided filename, or inferred from URL, returns local filename
+        """
+        if not to_filename:
+            to_filename = url.split('/')[-1].split('?')[0]
+            if len(to_filename) > 64:
+                to_filename = to_filename[-64:]
+        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
+        if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+        }
+        try:
+            d = requests.get(url, stream=True, headers=headers, timeout=30)
+            d.raise_for_status()
+
+            # get mimetype from the response headers
+            if not Path(to_filename).suffix:
+                content_type = d.headers.get('Content-Type')
+                extension = mimetypes.guess_extension(content_type)
+                if extension:
+                    to_filename += extension
+
+            with open(to_filename, 'wb') as f:
+                for chunk in d.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            return to_filename
+
+        except requests.RequestException as e:
+            logger.warning(f"Failed to fetch the Media URL: {e}")
+
+    @abstractmethod
+    def download(self, item: Metadata) -> Metadata:
+        pass