Further tidy-ups and refactoring for the new structure

* Add implementation tests for the orchestrator, plus logging tests * Standardise the method/class variables that extractors use to decide whether they are suitable for a URL * Fix bugs with removing the default loguru logger (allows further customisation) * Fix a bug when loading required fields from file
2026-06-13 05:38:29 +03:00 · 2025-01-30 13:21:10 +01:00
parent cddae65a90
commit b7d9145f6c
22 changed files with 292 additions and 51 deletions
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -11,9 +11,12 @@ from abc import abstractmethod
 from dataclasses import dataclass
 import mimetypes
 import os
-import mimetypes, requests
+import mimetypes
+
+import requests
 from loguru import logger
 from retrying import retry
+import re

 from ..core import Metadata, ArchivingContext, BaseModule

@@ -25,6 +28,8 @@ class Extractor(BaseModule):
    Subclasses must implement the `download` method to define platform-specific behavior.
    """

+    valid_url: re.Pattern = None
+
    def cleanup(self) -> None:
        # called when extractors are done, or upon errors, cleanup any resources
        pass
@@ -32,13 +37,20 @@ class Extractor(BaseModule):
    def sanitize_url(self, url: str) -> str:
        # used to clean unnecessary URL parameters OR unfurl redirect links
        return url
+    
+    def match_link(self, url: str) -> re.Match:
+        return self.valid_url.match(url)

    def suitable(self, url: str) -> bool:
        """
        Returns True if this extractor can handle the given URL

        Should be overridden by subclasses
+
        """
+        if self.valid_url:
+            return self.match_link(url) is not None
+        
        return True

    def _guess_file_type(self, path: str) -> str: