mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 20:28:28 +03:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9584193d69 | ||
|
|
0dd45d90f1 | ||
|
|
edcb2da74a | ||
|
|
17d9bf694f | ||
|
|
368395ffa8 | ||
|
|
21d7d2e16c | ||
|
|
0bbb4c9b08 |
@@ -4,7 +4,6 @@ ENV RUNNING_IN_DOCKER=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# TODO: use custom ffmpeg builds instead of apt-get install
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install pipenv && \
|
||||
add-apt-repository ppa:mozillateam/ppa && \
|
||||
@@ -18,7 +17,6 @@ RUN pip install --upgrade pip && \
|
||||
rm geckodriver-v*
|
||||
|
||||
|
||||
# TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile* ./
|
||||
# install from pipenv, with browsertrix-only requirements
|
||||
RUN pipenv install && \
|
||||
@@ -27,11 +25,6 @@ RUN pipenv install && \
|
||||
# doing this at the end helps during development, builds are quick
|
||||
COPY ./src/ .
|
||||
|
||||
# TODO: figure out how to make volumes not be owned by root; does it depend on the host or the Dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
|
||||
|
||||
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage is used)
|
||||
|
||||
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True})
|
||||
|
||||
try:
|
||||
# don't download since it can be a live stream
|
||||
|
||||
@@ -27,6 +27,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"browsertrix_home": {"default": None, "help": "path to use when calling docker run with a volume, by default it will be the tmp folder generated during execution, but setting this option is needed when running the auto-archiver in a docker container that calls another container via DooD."},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
|
||||
}
|
||||
@@ -46,7 +47,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
url = to_enrich.get_url()
|
||||
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
browsertrix_home = self.browsertrix_home or os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
||||
_MINOR = "6"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "6"
|
||||
_PATCH = "8"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user