mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
52a7cabaf1 | ||
|
|
a739361e12 | ||
|
|
9a97fede43 | ||
|
|
2d13077fad | ||
|
|
8a4a314cf9 | ||
|
|
75e8b788ae | ||
|
|
defe2315bf | ||
|
|
ba0dffdd5e | ||
|
|
a09927c507 | ||
|
|
6c938c489a | ||
|
|
0e39768da9 | ||
|
|
1e5d6ec4a6 | ||
|
|
3385d004cf | ||
|
|
7f27f7fce0 | ||
|
|
a6e3240af1 | ||
|
|
bf4c196cc2 |
6
.github/workflows/docker-publish.yaml
vendored
6
.github/workflows/docker-publish.yaml
vendored
@@ -22,7 +22,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Set up QEMU
|
- name: Set up QEMU
|
||||||
uses: docker/setup-qemu-action@v3
|
uses: docker/setup-qemu-action@v3
|
||||||
@@ -33,14 +33,14 @@ jobs:
|
|||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
- name: Log in to Docker Hub
|
||||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
|
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||||
|
|
||||||
- name: Extract metadata (tags, labels) for Docker
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
id: meta
|
id: meta
|
||||||
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
|
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051
|
||||||
with:
|
with:
|
||||||
images: bellingcat/auto-archiver
|
images: bellingcat/auto-archiver
|
||||||
|
|
||||||
|
|||||||
4
.github/workflows/python-publish.yaml
vendored
4
.github/workflows/python-publish.yaml
vendored
@@ -22,10 +22,10 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Repository
|
- name: Checkout Repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version-file: pyproject.toml
|
python-version-file: pyproject.toml
|
||||||
|
|
||||||
|
|||||||
6
.github/workflows/ruff.yaml
vendored
6
.github/workflows/ruff.yaml
vendored
@@ -20,11 +20,11 @@ jobs:
|
|||||||
build:
|
build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
- name: Install Python
|
- name: Install Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: "3.11"
|
python-version: "3.12"
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
|
|||||||
6
.github/workflows/tests-core.yaml
vendored
6
.github/workflows/tests-core.yaml
vendored
@@ -26,13 +26,13 @@ jobs:
|
|||||||
working-directory: ./
|
working-directory: ./
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Install ffmpeg
|
- name: Install ffmpeg
|
||||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||||
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
@@ -40,7 +40,7 @@ jobs:
|
|||||||
run: pipx install poetry
|
run: pipx install poetry
|
||||||
|
|
||||||
- name: Cache Poetry and pip artifacts
|
- name: Cache Poetry and pip artifacts
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v5
|
||||||
with:
|
with:
|
||||||
path: |
|
path: |
|
||||||
~/.cache/pypoetry
|
~/.cache/pypoetry
|
||||||
|
|||||||
6
.github/workflows/tests-download.yaml
vendored
6
.github/workflows/tests-download.yaml
vendored
@@ -20,13 +20,13 @@ jobs:
|
|||||||
working-directory: ./
|
working-directory: ./
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Install ffmpeg
|
- name: Install ffmpeg
|
||||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||||
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ jobs:
|
|||||||
run: pipx install poetry
|
run: pipx install poetry
|
||||||
|
|
||||||
- name: Cache Poetry and pip artifacts
|
- name: Cache Poetry and pip artifacts
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v5
|
||||||
with:
|
with:
|
||||||
path: |
|
path: |
|
||||||
~/.cache/pypoetry
|
~/.cache/pypoetry
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:1.9.2 AS base
|
FROM webrecorder/browsertrix-crawler:1.11.4 AS base
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1 \
|
ENV RUNNING_IN_DOCKER=1 \
|
||||||
LANG=C.UTF-8 \
|
LANG=C.UTF-8 \
|
||||||
|
|||||||
1122
poetry.lock
generated
1122
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.2.0"
|
version = "1.2.2"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
@@ -54,7 +54,7 @@ dependencies = [
|
|||||||
"cryptography (>=46.0.3)",
|
"cryptography (>=46.0.3)",
|
||||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
"yt-dlp[curl-cffi,default] (>=2025.5.22)",
|
||||||
"secretstorage (>=3.3.3,<4.0.0)",
|
"secretstorage (>=3.3.3,<4.0.0)",
|
||||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||||
"pyautogui (>=0.9.54,<0.10.0)",
|
"pyautogui (>=0.9.54,<0.10.0)",
|
||||||
@@ -66,7 +66,7 @@ pytest = "^8.3.4"
|
|||||||
autopep8 = "^2.3.1"
|
autopep8 = "^2.3.1"
|
||||||
pytest-loguru = "^0.4.0"
|
pytest-loguru = "^0.4.0"
|
||||||
pytest-mock = "^3.14.0"
|
pytest-mock = "^3.14.0"
|
||||||
ruff = "^0.9.10"
|
ruff = "^0.15.2"
|
||||||
pre-commit = "^4.1.0"
|
pre-commit = "^4.1.0"
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
|
|||||||
1106
scripts/settings/package-lock.json
generated
1106
scripts/settings/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -34,7 +34,7 @@ def _extract_metadata(self, webpage, video_id):
|
|||||||
...,
|
...,
|
||||||
"attachments",
|
"attachments",
|
||||||
...,
|
...,
|
||||||
lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"),
|
lambda k, v: k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video",
|
||||||
),
|
),
|
||||||
expected_type=dict,
|
expected_type=dict,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -355,7 +355,7 @@ class GenericExtractor(Extractor):
|
|||||||
if not dropin:
|
if not dropin:
|
||||||
# TODO: add a proper link to 'how to create your own dropin'
|
# TODO: add a proper link to 'how to create your own dropin'
|
||||||
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
||||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/generic_extractor.html#dropins""")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
post_data = dropin.extract_post(url, ie_instance)
|
post_data = dropin.extract_post(url, ie_instance)
|
||||||
|
|||||||
@@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||||
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||||
|
|
||||||
self.crawl_id = random_str(8)
|
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
||||||
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
|
|
||||||
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||||
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||||
@@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
|
|
||||||
collection = self.crawl_id
|
crawl_id = random_str(8)
|
||||||
|
collection = crawl_id
|
||||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||||
|
|
||||||
@@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
||||||
]
|
]
|
||||||
|
|
||||||
|
crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id)
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
cmd.extend(["--cwd", self.cwd_dind])
|
os.makedirs(crawl_cwd_dind, exist_ok=True)
|
||||||
|
cmd.extend(["--cwd", crawl_cwd_dind])
|
||||||
|
|
||||||
if self.auth_for_site(url):
|
if self.auth_for_site(url):
|
||||||
# there's an auth for this site, but browsertrix only supports username/password auth
|
# there's an auth for this site, but browsertrix only supports username/password auth
|
||||||
@@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
] + cmd
|
] + cmd
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
profile_file = f"profile-{crawl_id}.tar.gz"
|
||||||
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||||
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
@@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz")
|
||||||
elif self.use_docker:
|
elif self.use_docker:
|
||||||
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
||||||
else:
|
else:
|
||||||
@@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
||||||
elif self.use_docker:
|
elif self.use_docker:
|
||||||
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user