mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge pull request #74 from bellingcat/feature/browsertrix
This commit is contained in:
24
Dockerfile
24
Dockerfile
@@ -1,5 +1,6 @@
|
||||
# stage 1 - all dependencies
|
||||
From python:3.10
|
||||
FROM webrecorder/browsertrix-crawler:latest
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -7,29 +8,28 @@ WORKDIR /app
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install pipenv && \
|
||||
apt-get update && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto firefox && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
|
||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||
chmod +x /usr/local/bin/geckodriver && \
|
||||
rm geckodriver-v*
|
||||
|
||||
|
||||
# install docker for WACZ
|
||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||
# RUN curl -fsSL https://get.docker.com | sh
|
||||
|
||||
# TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile Pipfile.lock ./
|
||||
RUN pipenv install --python=3.10 --system --deploy
|
||||
# ENV IS_DOCKER=1
|
||||
COPY Pipfile* ./
|
||||
RUN pipenv install
|
||||
|
||||
# copy the source last: the dependency layers above stay cached, so rebuilds during development are quick
|
||||
COPY ./src/ .
|
||||
|
||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
ENTRYPOINT ["python"]
|
||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
ADD docker-entrypoint.sh /docker-entrypoint.sh
|
||||
ENTRYPOINT ["/docker-entrypoint.sh"]
|
||||
|
||||
CMD ["python3"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage)
|
||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
||||
6
Pipfile
6
Pipfile
@@ -30,9 +30,13 @@ cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
uwsgi = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
# wacz = "==0.4.8"
|
||||
pywb = ">=2.7.3"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.10"
|
||||
|
||||
[dev-packages]
|
||||
autopep8 = "*"
|
||||
|
||||
1325
Pipfile.lock
generated
1325
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
27
docker-entrypoint.sh
Executable file
27
docker-entrypoint.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/sh
# Container entrypoint: run the given command as the owner of the /crawls dir.
#
# Usage: docker-entrypoint.sh COMMAND [ARG...]
#
# If the current UID/GID already match the owner of /crawls, COMMAND is
# exec'd directly. Otherwise a "btrix" user and group carrying the UID/GID of
# /crawls are set up and COMMAND is run as that user through su.

# Get UID/GID from volume dir
VOLUME_UID=$(stat -c '%u' /crawls)
VOLUME_GID=$(stat -c '%g' /crawls)

# Get the UID/GID we are running as
MY_UID=$(id -u)
MY_GID=$(id -g)

# If we aren't running as the owner of the /crawls/ dir then add a new user
# btrix with the same UID/GID of the /crawls dir and run as that user instead.
if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
    # Create the group/user only when missing, so that re-running the
    # entrypoint in the same container does not emit "already exists" errors.
    # If getent is unavailable the check fails and creation is attempted anyway.
    getent group btrix > /dev/null 2>&1 || groupadd btrix
    # -o permits a non-unique GID in case another group already uses it.
    groupmod -o --gid "$VOLUME_GID" btrix

    getent passwd btrix > /dev/null 2>&1 || useradd -ms /bin/bash -g "$VOLUME_GID" btrix
    # -o permits a non-unique UID in case another user already uses it.
    usermod -o -u "$VOLUME_UID" btrix > /dev/null

    # The single-quoted "$@" is expanded by the shell that su starts;
    # argv0-ignore occupies that shell's $0 so the original arguments arrive
    # intact as $1..$n. exec replaces this shell so no extra process lingers.
    exec su btrix -c '"$@"' -- argv0-ignore "$@"
else
    exec "$@"
fi
|
||||
|
||||
@@ -26,36 +26,58 @@ class WaczEnricher(Enricher):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--id", collection,
|
||||
"--saveState", "never",
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout),
|
||||
"--profile", str(self.profile)
|
||||
]
|
||||
else:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
@@ -64,7 +86,13 @@ class WaczEnricher(Enricher):
|
||||
logger.error(f"WACZ generation failed: {e}")
|
||||
return False
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||
else:
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
if not os.path.exists(filename):
|
||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user