mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Refactoring for new config setup
This commit is contained in:
@@ -1,13 +1,13 @@
|
||||
{
|
||||
'name': 'Generic Extractor',
|
||||
'version': '0.1.0',
|
||||
'author': 'Bellingcat',
|
||||
'type': ['extractor'],
|
||||
'requires_setup': False,
|
||||
'dependencies': {
|
||||
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
|
||||
"name": "Generic Extractor",
|
||||
"version": "0.1.0",
|
||||
"author": "Bellingcat",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["yt_dlp", "requests", "loguru", "slugify"],
|
||||
},
|
||||
'description': """
|
||||
"description": """
|
||||
This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.
|
||||
|
||||
This module is responsible for downloading and processing media content from platforms
|
||||
@@ -28,17 +28,53 @@ the broader archiving framework.
|
||||
custom dropins can be created to handle additional websites and passed to the archiver
|
||||
via the command line using the `--dropins` option (TODO!).
|
||||
""",
|
||||
'configs': {
|
||||
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
|
||||
"subtitles": {"default": True, "help": "download subtitles if available"},
|
||||
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
|
||||
"livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
|
||||
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
|
||||
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
|
||||
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
|
||||
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
|
||||
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
|
||||
"cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
|
||||
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
|
||||
}
|
||||
}
|
||||
"configs": {
|
||||
"facebook_cookie": {
|
||||
"default": None,
|
||||
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
|
||||
},
|
||||
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
|
||||
"comments": {
|
||||
"default": False,
|
||||
"help": "download all comments if available, may lead to large metadata",
|
||||
"type": "bool",
|
||||
},
|
||||
"livestreams": {
|
||||
"default": False,
|
||||
"help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control",
|
||||
"type": "bool",
|
||||
},
|
||||
"live_from_start": {
|
||||
"default": False,
|
||||
"help": "if set, will download live streams from their earliest available moment, otherwise starts now.",
|
||||
"type": "bool",
|
||||
},
|
||||
"proxy": {
|
||||
"default": "",
|
||||
"help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
"end_means_success": {
|
||||
"default": True,
|
||||
"help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
|
||||
"type": "bool",
|
||||
},
|
||||
"allow_playlist": {
|
||||
"default": False,
|
||||
"help": "If True will also download playlists, set to False if the expectation is to download a single video.",
|
||||
"type": "bool",
|
||||
},
|
||||
"max_downloads": {
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
|
||||
},
|
||||
"cookies_from_browser": {
|
||||
"default": None,
|
||||
"type": "str",
|
||||
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
|
||||
},
|
||||
"cookie_file": {
|
||||
"default": None,
|
||||
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.base_processors.extractor import Extractor
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.base_processors.extractor import Extractor
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
||||
class GenericDropin:
|
||||
"""Base class for dropins for the generic extractor.
|
||||
|
||||
@@ -5,11 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.base_processors.extractor import Extractor
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
|
||||
class GenericExtractor(Extractor):
|
||||
name = "youtubedl_archiver" #left as is for backwards compat
|
||||
_dropins = {}
|
||||
|
||||
def suitable_extractors(self, url: str) -> list[str]:
|
||||
@@ -268,7 +267,7 @@ class GenericExtractor(Extractor):
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
if item.netloc in ['youtube.com', 'www.youtube.com']:
|
||||
@@ -285,6 +284,6 @@ class GenericExtractor(Extractor):
|
||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if result:
|
||||
return result
|
||||
|
||||
|
||||
|
||||
return False
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Type
|
||||
|
||||
from auto_archiver.utils import traverse_obj
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.base_processors.extractor import Extractor
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
||||
@@ -6,7 +6,7 @@ from slugify import slugify
|
||||
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.base_processors.extractor import Extractor
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
Reference in New Issue
Block a user