mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Compare commits
4 Commits
v0.12.0
...
youtube-co
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
82c00d491d | ||
|
|
e49550163f | ||
|
|
e6f5981afc | ||
|
|
c62bf1a34d |
4
Pipfile
4
Pipfile
@@ -30,10 +30,10 @@ tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "*"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = "*"
|
||||
yt-dlp = "2024.09.27"
|
||||
vk-url-scraper = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
numpy = "*"
|
||||
numpy = "1.26.4"
|
||||
warcio = "*"
|
||||
jsonlines = "*"
|
||||
pysubs2 = "*"
|
||||
|
||||
1961
Pipfile.lock
generated
1961
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -94,6 +94,14 @@ configurations:
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
youtubedl_archiver:
|
||||
subtitles: true
|
||||
# use one of the following two methods to authenticate with YouTube - either provide a cookie file or use the cookies from the given browser
|
||||
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
|
||||
# cookie_file: "secrets/youtube_cookies.txt"
|
||||
# cookies_from_browser: firefox
|
||||
# proxy: socks5://proxy-user:password@proxy-ip:port
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
|
||||
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
|
||||
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
|
||||
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
|
||||
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
|
||||
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
|
||||
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
if item.netloc in ['youtube.com', 'www.youtube.com']:
|
||||
if self.cookies_from_browser:
|
||||
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
|
||||
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
|
||||
elif self.cookie_file:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file}')
|
||||
ydl_options['cookiefile'] = self.cookie_file
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
try:
|
||||
|
||||
@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
my_env = os.environ.copy()
|
||||
if self.socks_proxy_host and self.socks_proxy_port:
|
||||
if self.proxy_server:
|
||||
logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
|
||||
my_env["PROXY_SERVER"] = self.proxy_server
|
||||
elif self.socks_proxy_host and self.socks_proxy_port:
|
||||
logger.debug("Using SOCKS proxy for browsertrix-crawler")
|
||||
my_env["SOCKS_HOST"] = self.socks_proxy_host
|
||||
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "12"
|
||||
_MINOR = "13"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "0"
|
||||
_PATCH = "1"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user