mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
removes exclude_media_extensions option
This commit is contained in:
@@ -17,10 +17,6 @@
|
||||
"default": 50,
|
||||
"help": "maximum number of videos to download from the page (0 = no download, inf = no limit).",
|
||||
},
|
||||
"exclude_media_extensions": {
|
||||
"default": ".svg,.ico,.gif",
|
||||
"help": "CSV of media (image/video) file extensions to exclude from download",
|
||||
},
|
||||
"user_data_dir": {
|
||||
"default": "secrets/antibot_user_data",
|
||||
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.",
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import base64
|
||||
import math
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
@@ -26,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
self.agent = None # Use the default UserAgent
|
||||
|
||||
# parse configuration options
|
||||
self.exclude_media_mimetypes = set(
|
||||
[mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")]
|
||||
) - {None}
|
||||
|
||||
if self.max_download_images == "inf":
|
||||
self.max_download_images = math.inf
|
||||
else:
|
||||
@@ -292,9 +287,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
break
|
||||
if not is_relevant_url(src):
|
||||
continue
|
||||
mimetype = mimetypes.guess_type(src)[0]
|
||||
if mimetype in self.exclude_media_mimetypes:
|
||||
continue
|
||||
full_src = urljoin(url, src)
|
||||
if full_src not in all_urls:
|
||||
filename, full_src = self.download_from_url(full_src, try_best_quality=True)
|
||||
|
||||
@@ -111,12 +111,14 @@ def is_relevant_url(url: str) -> bool:
|
||||
("emoji.redditmedia.com",),
|
||||
]
|
||||
|
||||
# TODO: make these globally configurable
|
||||
IRRELEVANT_ENDS_WITH = [
|
||||
".svg", # ignore SVGs
|
||||
".ico", # ignore icons
|
||||
# ignore index files for videos, these should be handled by ytdlp
|
||||
".m3u8",
|
||||
".mpd",
|
||||
".ism", # ignore index files for videos, these should be handled by ytdlp
|
||||
".ism",
|
||||
]
|
||||
|
||||
for end in IRRELEVANT_ENDS_WITH:
|
||||
|
||||
@@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"save_to_pdf": False,
|
||||
"max_download_images": 0,
|
||||
"max_download_videos": 0,
|
||||
"exclude_media_extensions": ".svg,.ico,.gif",
|
||||
"proxy": None,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user