mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Merge branch 'refs/heads/main' into feat/yt-dlp-pots
# Conflicts: # poetry.lock
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import sys
|
||||
import datetime
|
||||
import os
|
||||
import importlib
|
||||
@@ -42,12 +43,24 @@ class GenericExtractor(Extractor):
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
updated = self.update_ytdlp()
|
||||
|
||||
next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
|
||||
with open(path, "w") as f:
|
||||
f.write(next_update_check.isoformat())
|
||||
|
||||
if not updated:
|
||||
return
|
||||
|
||||
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
|
||||
logger.warning(
|
||||
"yt-dlp has been updated. Auto archiver should be restarted for these changes to take effect"
|
||||
)
|
||||
else:
|
||||
logger.warning("Restarting auto-archiver to apply yt-dlp update")
|
||||
logger.warning(" ======= RESTARTING ======= ")
|
||||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||||
|
||||
def update_ytdlp(self):
|
||||
logger.info("Checking and updating yt-dlp...")
|
||||
logger.info(
|
||||
@@ -63,12 +76,14 @@ class GenericExtractor(Extractor):
|
||||
if "Successfully installed yt-dlp" in result.stdout.decode():
|
||||
new_version = importlib.metadata.version("yt-dlp")
|
||||
logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
|
||||
importlib.reload(yt_dlp)
|
||||
return True
|
||||
else:
|
||||
logger.info("yt-dlp already up to date")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating yt-dlp: {e}")
|
||||
return False
|
||||
|
||||
def setup_token_script(self):
|
||||
"""Setup PO Token provider https://github.com/Brainicism/bgutil-ytdlp-pot-provider."""
|
||||
|
||||
@@ -40,27 +40,31 @@
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||
|
||||
## Setup
|
||||
|
||||
**Docker**
|
||||
If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the Docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
**Browsertrix Profiles**
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
|
||||
for more information.
|
||||
|
||||
**Docker in Docker**
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
## Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
## Setup
|
||||
|
||||
### Using Docker
|
||||
If you are using the Auto Archiver [Docker image](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html#installing-with-docker)
|
||||
to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the Docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
### Browsertrix Profiles
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile.
|
||||
See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) for more information on how to use the `create-login-profile` tool.
|
||||
|
||||
|
||||
|
||||
### Docker in Docker
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -86,6 +86,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
if self.docker_in_docker:
|
||||
cmd.extend(["--cwd", self.cwd_dind])
|
||||
|
||||
if self.auth_for_site(url):
|
||||
# there's an auth for this site, but browsertrix only supports username/password auth
|
||||
logger.warning(
|
||||
"The WACZ enricher / Browsertrix does not support using the 'authentication' information for logging in. You should consider creating a Browser Profile for WACZ archiving. More information: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/wacz_extractor_enricher.html#browsertrix-profiles"
|
||||
)
|
||||
|
||||
# call docker if explicitly enabled or we are running on the host (not in docker)
|
||||
if self.use_docker:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
||||
Reference in New Issue
Block a user