Merge pull request #262 from bellingcat/generic_extractor_args

Add flexible extractor_args to generic_extractor.py

This allows users to pass any of the extractor arguments listed in the [yt-dlp README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments) through to yt-dlp via the `extractor_args` option.

Example usage:

```
generic_extractor:
  facebook_cookie:
  ...
  extractor_args:
    youtube:
      player_client: web,tv
    generic:
      is_live: true
```
This commit is contained in:
Erin Clark
2025-03-20 15:38:20 +00:00
committed by GitHub
3 changed files with 38 additions and 7 deletions

View File

@@ -74,6 +74,11 @@ If you are having issues with the extractor, you can review the version of `yt-d
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"extractor_args": {
"default": {},
"help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
"type": "json_loader",
},
"ytdlp_update_interval": {
"default": 5,
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",

View File

@@ -422,16 +422,20 @@ class GenericExtractor(Extractor):
"--write-subs" if self.subtitles else "--no-write-subs",
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
"--live-from-start" if self.live_from_start else "--no-live-from-start",
"--proxy",
self.proxy if self.proxy else "",
f"--max-downloads {self.max_downloads}" if self.max_downloads != "inf" else "",
f"--playlist-end {self.max_downloads}" if self.max_downloads != "inf" else "",
]
# proxy handling
if self.proxy:
ydl_options.extend(["--proxy", self.proxy])
# max_downloads handling
if self.max_downloads != "inf":
ydl_options.extend(["--max-downloads", str(self.max_downloads)])
ydl_options.extend(["--playlist-end", str(self.max_downloads)])
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if "username" in auth and "password" in auth:
logger.debug(f"Using provided auth username and password for {url}")
@@ -447,6 +451,16 @@ class GenericExtractor(Extractor):
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
ydl_options.extend(("--cookies", auth["cookies_file"]))
# Applying user-defined extractor_args
if self.extractor_args:
for key, args in self.extractor_args.items():
logger.debug(f"Setting extractor_args: {key}")
if isinstance(args, dict):
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else:
arg_str = str(args)
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args:
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
ydl_options += self.ytdlp_args.split(" ")

View File

@@ -81,8 +81,20 @@ def test_load_modules(module_name):
# check that default settings are applied
default_config = module.configs
assert loaded_module.name in loaded_module.config.keys()
defaults = {k for k in default_config}
assert defaults in [loaded_module.config[module_name].keys()]
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_config_defaults(module_name):
# test the values of the default config values are set
# Note: some modules can alter values in the setup() method, this test checks cases that don't
module = ModuleFactory().get_module_lazy(module_name)
loaded_module = module.load({})
# check that default config values are set
default_config = module.configs
defaults = {k: v.get("default") for k, v in default_config.items()}
assert loaded_module.config[module_name] == defaults
assert defaults == loaded_module.config[module_name]
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])