diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index c7be89f..e2bcad9 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -17,10 +17,6 @@ "default": 50, "help": "maximum number of videos to download from the page (0 = no download, inf = no limit).", }, - "exclude_media_extensions": { - "default": ".svg,.ico,.gif", - "help": "CSV of media (image/video) file extensions to exclude from download", - }, "user_data_dir": { "default": "secrets/antibot_user_data", "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.", diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 549cced..1982389 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -1,6 +1,5 @@ import base64 import math -import mimetypes import os import sys import traceback @@ -26,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): self.agent = None # Use the default UserAgent # parse configuration options - self.exclude_media_mimetypes = set( - [mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")] - ) - {None} - if self.max_download_images == "inf": self.max_download_images = math.inf else: @@ -292,9 +287,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): break if not is_relevant_url(src): continue - mimetype = mimetypes.guess_type(src)[0] - if mimetype in self.exclude_media_mimetypes: - continue full_src = urljoin(url, src) if full_src not in all_urls: filename, full_src = self.download_from_url(full_src, try_best_quality=True) diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index ea03d7f..2bb19cf 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -111,12 +111,14 @@ def is_relevant_url(url: str) -> bool: ("emoji.redditmedia.com",), ] + # TODO: make these globally configurable IRRELEVANT_ENDS_WITH = [ ".svg", # ignore SVGs ".ico", # ignore icons + # ignore index files for videos, these should be handled by ytdlp ".m3u8", ".mpd", - ".ism", # ignore index files for videos, these should be handled by ytdlp + ".ism", ] for end in IRRELEVANT_ENDS_WITH: diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 600baf2..06107b4 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "save_to_pdf": False, "max_download_images": 0, "max_download_videos": 0, - "exclude_media_extensions": ".svg,.ico,.gif", "proxy": None, }