Merge pull request #74 from bellingcat/feature/browsertrix

This commit is contained in:
Miguel Sozinho Ramalho
2023-05-10 09:36:41 +01:00
committed by GitHub
5 changed files with 1111 additions and 347 deletions

View File

@@ -1,5 +1,6 @@
# stage 1 - all dependencies
# NOTE(review): the two FROM lines below look like the removed and added sides of
# a single diff line (python:3.10 -> browsertrix-crawler) -- confirm that only the
# second one is meant to remain in the resulting Dockerfile.
# NOTE(review): ":latest" is a floating tag, so a rebuild may silently pick up a
# different base image; consider pinning a specific version.
From python:3.10
FROM webrecorder/browsertrix-crawler:latest
# Read by the WACZ enricher through os.getenv('RUNNING_IN_DOCKER'): when set, it
# invokes "crawl" directly instead of wrapping it in "docker run".
ENV RUNNING_IN_DOCKER=1
WORKDIR /app
@@ -7,29 +8,28 @@ WORKDIR /app
# One layer that: upgrades pip, installs pipenv, installs the OS packages
# (gcc, ffmpeg, fonts-noto and Firefox), downloads geckodriver v0.32.0, unpacks it
# into /usr/local/bin, marks it executable and deletes the downloaded tarball.
# NOTE(review): the two "apt-get install" lines differ only in firefox-esr vs
# firefox and look like the old/new sides of one diff line -- confirm which applies.
RUN pip install --upgrade pip && \
pip install pipenv && \
apt-get update && \
apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
apt-get install -y gcc ffmpeg fonts-noto firefox && \
wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v*
# install docker for WACZ
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# RUN curl -fsSL https://get.docker.com | sh
# TODO: avoid copying unnecessary files, including .git
# Only the Pipfile manifests are copied before installing Python dependencies;
# the application source is copied later (see the COPY ./src/ step).
# NOTE(review): two COPY + "pipenv install" pairs are present; they appear to be
# the old/new sides of the diff. They are not equivalent: one installs into the
# system interpreter with --system --deploy, the other is a bare "pipenv install".
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
# ENV IS_DOCKER=1
COPY Pipfile* ./
RUN pipenv install
# doing this at the end helps during development, builds are quick
COPY ./src/ .
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
# NOTE(review): two ENTRYPOINT instructions are present; Docker honours only the
# last ENTRYPOINT in a Dockerfile, so ["python"] would have no effect if both stay.
# They look like the old/new sides of the diff -- confirm.
ENTRYPOINT ["python"]
# ENTRYPOINT ["docker-entrypoint.sh"]
# NOTE(review): ADD is used for a plain local file; COPY is the conventional
# instruction when no archive extraction or URL fetch is needed.
ADD docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]
# Default arguments passed to /docker-entrypoint.sh, which runs them as "$@";
# arguments given to "docker run" replace this default.
CMD ["python3"]
# should be executed with 2 volumes (3 if local_storage)
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help

View File

@@ -30,9 +30,13 @@ cryptography = "==38.0.4"
dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
vk-url-scraper = "*"
uwsgi = "*"
requests = {extras = ["socks"], version = "*"}
# wacz = "==0.4.8"
pywb = ">=2.7.3"
[requires]
python_version = "3.9"
python_version = "3.10"
[dev-packages]
autopep8 = "*"

1325
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

27
docker-entrypoint.sh Executable file
View File

@@ -0,0 +1,27 @@
#!/bin/sh
# Container entrypoint: run the given command as the owner of /crawls.
#
# Usage: docker-entrypoint.sh COMMAND [ARG...]
#
# If the current UID/GID already match the owner of /crawls, COMMAND is exec'd
# directly. Otherwise a "btrix" user and group carrying the owner's UID/GID are
# created (or reused when they already exist) and COMMAND is run as that user.

# Get UID/GID from volume dir
VOLUME_UID=$(stat -c '%u' /crawls)
VOLUME_GID=$(stat -c '%g' /crawls)

# Get the UID/GID we are running as
MY_UID=$(id -u)
MY_GID=$(id -g)

# If we aren't running as the owner of the /crawls/ dir then add a new user
# btrix with the same UID/GID of the /crawls dir and run as that user instead.
if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
    # -f: succeed when the group already exists, e.g. when a stopped container
    # is started again and this script runs a second time.
    groupadd -f btrix
    # -o: permit a GID that is already assigned to another group.
    groupmod -o --gid "$VOLUME_GID" btrix

    # Create the user only once; later starts reuse it.
    if ! id -u btrix > /dev/null 2>&1; then
        useradd -ms /bin/bash -g "$VOLUME_GID" btrix
    fi
    # Re-sync UID and primary group every time so a reused user still matches
    # the current owner of /crawls. -o permits a non-unique UID.
    usermod -o -u "$VOLUME_UID" -g "$VOLUME_GID" btrix > /dev/null

    # The single-quoted '"$@"' is expanded by the shell that su starts:
    # "argv0-ignore" fills $0 there, so the original arguments arrive intact as
    # "$@". exec replaces this wrapper shell instead of leaving it running in
    # front of the command, matching the else branch.
    exec su btrix -c '"$@"' -- argv0-ignore "$@"
else
    exec "$@"
fi

View File

@@ -26,36 +26,58 @@ class WaczEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
# TODO: test which is right
cmd.extend(["--profile", profile_fn])
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
if os.getenv('RUNNING_IN_DOCKER'):
logger.debug(f"generating WACZ without Docker for {url=}")
cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout),
"--profile", str(self.profile)
]
else:
logger.debug(f"generating WACZ in Docker for {url=}")
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
# TODO: test which is right
cmd.extend(["--profile", profile_fn])
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -64,7 +86,13 @@ class WaczEnricher(Enricher):
logger.error(f"WACZ generation failed: {e}")
return False
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
if os.getenv('RUNNING_IN_DOCKER'):
filename = os.path.join("collections", collection, f"{collection}.wacz")
else:
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
if not os.path.exists(filename):
logger.warning(f"Unable to locate and upload WACZ {filename=}")
return False