Merge pull request #318 from bellingcat/feat/antibot-reddit

Adds RedditDropin and other flow improvements
This commit is contained in:
Miguel Sozinho Ramalho
2025-06-10 18:39:34 +01:00
committed by GitHub
17 changed files with 613 additions and 324 deletions

View File

@@ -12,7 +12,7 @@ updates:
patterns:
- "*"
schedule:
interval: "weekly"
interval: "monthly"
- package-ecosystem: "github-actions"
directory: "/"
@@ -21,7 +21,7 @@ updates:
patterns:
- "*"
schedule:
interval: "weekly"
interval: "monthly"
- package-ecosystem: "npm"
directory: "/scripts/settings/"
@@ -30,11 +30,11 @@ updates:
patterns:
- "*"
schedule:
interval: "weekly"
interval: "monthly"
- package-ecosystem: "docker"
# Look for a `Dockerfile` in the `root` directory
directory: "/"
# Check for updates once a month
schedule:
interval: "weekly"
interval: "monthly"

View File

@@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.6.1 AS base
FROM webrecorder/browsertrix-crawler:1.6.2 AS base
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \

View File

@@ -0,0 +1,44 @@
# Upgrading from v1.0.1
```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.1.0).
If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
```
Versions 1.1.0+ of Auto Archiver have breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
## Dropping `vk_extractor` module
We have dropped the `vk_extractor` because of problems in a project we relied on. You will need to remove it from your configuration file, otherwise you will see an error like:
```{code} console
Module 'vk_extractor' not found. Are you sure it's installed/exists?
```
## New `antibot_extractor_enricher` module and VkDropin
We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
```{code} yaml
steps:
extractors:
- antibot_extractor_enricher
# or alternatively, if you want to use it as an enricher:
enrichers:
- antibot_extractor_enricher
```
It comes with Dropins that we will be adding and maintaining.
> Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though.
One such Dropin is the VkDropin, which uses this automated browser to access VKontakte (VK) pages. If you get authentication blocks from VK, you should add a username/password to the configuration file using the [authentication settings](authentication_how_to.md):
```{code} yaml
authentication:
vk:
username: your_username
password: your_password
```
See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin.

109
poetry.lock generated
View File

@@ -193,18 +193,18 @@ files = [
[[package]]
name = "boto3"
version = "1.38.27"
version = "1.38.33"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "boto3-1.38.27-py3-none-any.whl", hash = "sha256:95f5fe688795303a8a15e8b7e7f255cadab35eae459d00cc281a4fd77252ea80"},
{file = "boto3-1.38.27.tar.gz", hash = "sha256:94bd7fdd92d5701b362d4df100d21e28f8307a67ff56b6a8b0398119cf22f859"},
{file = "boto3-1.38.33-py3-none-any.whl", hash = "sha256:25d0717489c658f7ae6c3c7f0f7e1b4d611b30b2f08f0fcef6455ac6864a8768"},
{file = "boto3-1.38.33.tar.gz", hash = "sha256:6467909c1ae01ff67981f021bb2568592211765ec8a9a1d2c4529191e46c3541"},
]
[package.dependencies]
botocore = ">=1.38.27,<1.39.0"
botocore = ">=1.38.33,<1.39.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.13.0,<0.14.0"
@@ -213,14 +213,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
version = "1.38.27"
version = "1.38.33"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "botocore-1.38.27-py3-none-any.whl", hash = "sha256:a785d5e9a5eda88ad6ab9ed8b87d1f2ac409d0226bba6ff801c55359e94d91a8"},
{file = "botocore-1.38.27.tar.gz", hash = "sha256:9788f7efe974328a38cbade64cc0b1e67d27944b899f88cb786ae362973133b6"},
{file = "botocore-1.38.33-py3-none-any.whl", hash = "sha256:ad25233e93dcebe95809b1f9393c1f11a639696327793d166295fb78dd7bc597"},
{file = "botocore-1.38.33.tar.gz", hash = "sha256:dbe8fea9d0426c644c89ef2018ead886ccbcc22901a02b377b4e65ce1cb69a2b"},
]
[package.dependencies]
@@ -941,14 +941,14 @@ files = [
[[package]]
name = "google-api-core"
version = "2.24.2"
version = "2.25.0"
description = "Google API client core library"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_api_core-2.24.2-py3-none-any.whl", hash = "sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9"},
{file = "google_api_core-2.24.2.tar.gz", hash = "sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696"},
{file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"},
{file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"},
]
[package.dependencies]
@@ -959,21 +959,21 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4
requests = ">=2.18.0,<3.0.0"
[package.extras]
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.0)"]
grpc = ["grpcio (>=1.33.2,<2.0.0)", "grpcio (>=1.49.1,<2.0.0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.0)", "grpcio-status (>=1.49.1,<2.0.0) ; python_version >= \"3.11\""]
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"]
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"]
[[package]]
name = "google-api-python-client"
version = "2.170.0"
version = "2.171.0"
description = "Google API Client Library for Python"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_api_python_client-2.170.0-py3-none-any.whl", hash = "sha256:7bf518a0527ad23322f070fa69f4f24053170d5c766821dc970ff0571ec22748"},
{file = "google_api_python_client-2.170.0.tar.gz", hash = "sha256:75f3a1856f11418ea3723214e0abc59d9b217fd7ed43dcf743aab7f06ab9e2b1"},
{file = "google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9"},
{file = "google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23"},
]
[package.dependencies]
@@ -985,14 +985,14 @@ uritemplate = ">=3.0.1,<5"
[[package]]
name = "google-auth"
version = "2.40.2"
version = "2.40.3"
description = "Google Authentication Library"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_auth-2.40.2-py2.py3-none-any.whl", hash = "sha256:f7e568d42eedfded58734f6a60c58321896a621f7c116c411550a4b4a13da90b"},
{file = "google_auth-2.40.2.tar.gz", hash = "sha256:a33cde547a2134273226fa4b853883559947ebe9207521f7afc707efbf690f58"},
{file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"},
{file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"},
]
[package.dependencies]
@@ -2130,7 +2130,7 @@ version = "2.19.1"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
python-versions = ">=3.8"
groups = ["main", "docs"]
groups = ["main", "dev", "docs"]
files = [
{file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
{file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
@@ -2335,26 +2335,27 @@ files = [
[[package]]
name = "pytest"
version = "8.3.5"
version = "8.4.0"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
groups = ["main", "dev"]
files = [
{file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
{file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
{file = "pytest-8.4.0-py3-none-any.whl", hash = "sha256:f40f825768ad76c0977cbacdf1fd37c6f7a468e460ea6a0636078f8972d4517e"},
{file = "pytest-8.4.0.tar.gz", hash = "sha256:14d920b48472ea0dbf68e45b96cd1ffda4705f33307dcc86c676c1b5104838a6"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""}
iniconfig = ">=1"
packaging = ">=20"
pluggy = ">=1.5,<2"
pygments = ">=2.7.2"
tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-html"
@@ -2766,19 +2767,19 @@ files = [
[[package]]
name = "requests"
version = "2.32.3"
version = "2.32.4"
description = "Python HTTP for Humans."
optional = false
python-versions = ">=3.8"
groups = ["main", "docs"]
files = [
{file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
{file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
{file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"},
{file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"},
]
[package.dependencies]
certifi = ">=2017.4.17"
charset-normalizer = ">=2,<4"
charset_normalizer = ">=2,<4"
idna = ">=2.5,<4"
PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
urllib3 = ">=1.21.1,<3"
@@ -2894,7 +2895,7 @@ description = "Manipulate well-formed Roman numerals"
optional = false
python-versions = ">=3.9"
groups = ["docs"]
markers = "python_version >= \"3.11\""
markers = "python_version != \"3.10\""
files = [
{file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"},
{file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"},
@@ -2921,14 +2922,14 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruamel-yaml"
version = "0.18.12"
version = "0.18.14"
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "ruamel.yaml-0.18.12-py3-none-any.whl", hash = "sha256:790ba4c48b6a6e6b12b532a7308779eb12d2aaab3a80fdb8389216f28ea2b287"},
{file = "ruamel.yaml-0.18.12.tar.gz", hash = "sha256:5a38fd5ce39d223bebb9e3a6779e86b9427a03fb0bf9f270060f8b149cffe5e2"},
{file = "ruamel.yaml-0.18.14-py3-none-any.whl", hash = "sha256:710ff198bb53da66718c7db27eec4fbcc9aa6ca7204e4c1df2f282b6fe5eb6b2"},
{file = "ruamel.yaml-0.18.14.tar.gz", hash = "sha256:7227b76aaec364df15936730efbf7d72b30c0b79b1d578bbb8e3dcb2d81f52b7"},
]
[package.dependencies]
@@ -3112,14 +3113,14 @@ websocket-client = ">=1.8.0,<1.9.0"
[[package]]
name = "seleniumbase"
version = "4.39.2"
version = "4.39.3"
description = "A complete web automation framework for end-to-end testing."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"},
{file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"},
{file = "seleniumbase-4.39.3-py3-none-any.whl", hash = "sha256:cbb94d7858a9ef3b0b4431a5879150649f4a73029afaa8ecfb7bda113f2565e1"},
{file = "seleniumbase-4.39.3.tar.gz", hash = "sha256:b32978e685b1e4e2c7859b2dcb377ac14ba99bf748ea428548f9e450257b861b"},
]
[package.dependencies]
@@ -3156,7 +3157,7 @@ pygments = ">=2.19.1"
pynose = ">=1.5.4"
pyotp = "2.9.0"
pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""}
pytest = "8.3.5"
pytest = {version = "8.4.0", markers = "python_version >= \"3.9\""}
pytest-html = "4.0.2"
pytest-metadata = "3.1.1"
pytest-ordering = "0.6"
@@ -3164,11 +3165,11 @@ pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""}
pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""}
python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""}
pyyaml = ">=6.0.2"
requests = "2.32.3"
requests = "2.32.4"
rich = ">=14.0.0,<15"
sbvirtualdisplay = ">=1.4.0"
selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""}
setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""}
setuptools = {version = ">=80.9.0", markers = "python_version >= \"3.10\""}
six = ">=1.17.0"
sniffio = "1.3.1"
sortedcontainers = "2.4.0"
@@ -3323,7 +3324,7 @@ description = "Python documentation generator"
optional = false
python-versions = ">=3.11"
groups = ["docs"]
markers = "python_version >= \"3.11\""
markers = "python_version != \"3.10\""
files = [
{file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"},
{file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"},
@@ -3801,14 +3802,14 @@ test = ["coverage", "pytest", "pytest-cov"]
[[package]]
name = "uritemplate"
version = "4.1.1"
version = "4.2.0"
description = "Implementation of RFC 6570 URI Templates"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"},
{file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"},
{file = "uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686"},
{file = "uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e"},
]
[[package]]
@@ -4120,14 +4121,14 @@ h11 = ">=0.9.0,<1"
[[package]]
name = "yt-dlp"
version = "2025.5.22"
version = "2025.6.9"
description = "A feature-rich command-line audio/video downloader"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "yt_dlp-2025.5.22-py3-none-any.whl", hash = "sha256:a49c4b76afeaded6254c3e2b759d8d5a13271aa963d5fccb51fe059d1c313151"},
{file = "yt_dlp-2025.5.22.tar.gz", hash = "sha256:ea73854c5dabc124f29a35a8fae9bc5d422ef3231bebeea2bdfa82ac191a9c29"},
{file = "yt_dlp-2025.6.9-py3-none-any.whl", hash = "sha256:ebdfda9ffa807f6a26aed7c8f906e5557cd06b4c388dc547df1ec2078631fca8"},
{file = "yt_dlp-2025.6.9.tar.gz", hash = "sha256:751f53a3b61353522bf805fa30bbcbd16666126537e39706eab4f8c368f111ac"},
]
[package.dependencies]
@@ -4142,7 +4143,7 @@ urllib3 = {version = ">=1.26.17,<3", optional = true, markers = "extra == \"defa
websockets = {version = ">=13.0", optional = true, markers = "extra == \"default\""}
[package.extras]
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"]
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2,<81)", "wheel"]
curl-cffi = ["curl-cffi (>=0.5.10,<0.6.dev0 || ==0.10.*) ; implementation_name == \"cpython\""]
default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.11.0,<0.12.0)"]

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "1.0.1"
version = "1.1.0"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"

View File

@@ -10,21 +10,21 @@
"dependencies": {
"@dnd-kit/core": "^6.3.1",
"@dnd-kit/sortable": "^10.0.0",
"@emotion/react": "*",
"@emotion/styled": "*",
"@emotion/react": "latest",
"@emotion/styled": "latest",
"@mui/icons-material": "^7.1.1",
"@mui/material": "*",
"@mui/material": "latest",
"react": "19.1.0",
"react-dom": "19.1.0",
"react-markdown": "^10.0.0",
"yaml": "^2.7.0"
},
"devDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"@vitejs/plugin-react": "*",
"typescript": "*",
"vite": "*",
"@types/react": "latest",
"@types/react-dom": "latest",
"@vitejs/plugin-react": "latest",
"typescript": "latest",
"vite": "latest",
"vite-plugin-singlefile": "^2.1.0"
}
},
@@ -57,9 +57,9 @@
}
},
"node_modules/@babel/compat-data": {
"version": "7.27.3",
"resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.3.tgz",
"integrity": "sha512-V42wFfx1ymFte+ecf6iXghnnP8kWTO+ZLXIyZq+1LAXHHvTZdVxicn4yiVYdYMGaCO3tmqub11AorKkv+iodqw==",
"version": "7.27.5",
"resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.5.tgz",
"integrity": "sha512-KiRAp/VoJaWkkte84TvUd9qjdbZAdiqyvMxrGl1N6vzFogKmaLgoM3L1kgtLicp2HP5fBJS8JrZKLVIZGVJAVg==",
"dev": true,
"license": "MIT",
"engines": {
@@ -105,12 +105,12 @@
"license": "MIT"
},
"node_modules/@babel/generator": {
"version": "7.27.3",
"resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.3.tgz",
"integrity": "sha512-xnlJYj5zepml8NXtjkG0WquFUv8RskFqyFcVgTBp5k+NaA/8uw/K+OSVf8AMGw5e9HKP2ETd5xpK5MLZQD6b4Q==",
"version": "7.27.5",
"resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.5.tgz",
"integrity": "sha512-ZGhA37l0e/g2s1Cnzdix0O3aLYm66eF8aufiVteOgnwxgnRP8GoyMj7VWsgWnQbVKXyge7hqrFh2K2TQM6t1Hw==",
"license": "MIT",
"dependencies": {
"@babel/parser": "^7.27.3",
"@babel/parser": "^7.27.5",
"@babel/types": "^7.27.3",
"@jridgewell/gen-mapping": "^0.3.5",
"@jridgewell/trace-mapping": "^0.3.25",
@@ -207,23 +207,23 @@
}
},
"node_modules/@babel/helpers": {
"version": "7.27.4",
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.4.tgz",
"integrity": "sha512-Y+bO6U+I7ZKaM5G5rDUZiYfUvQPUibYmAFe7EnKdnKBbVXDZxvp+MWOH5gYciY0EPk4EScsuFMQBbEfpdRKSCQ==",
"version": "7.27.6",
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.6.tgz",
"integrity": "sha512-muE8Tt8M22638HU31A3CgfSUciwz1fhATfoVai05aPXGor//CdWDCbnlY1yvBPo07njuVOCNGCSp/GTt12lIug==",
"dev": true,
"license": "MIT",
"dependencies": {
"@babel/template": "^7.27.2",
"@babel/types": "^7.27.3"
"@babel/types": "^7.27.6"
},
"engines": {
"node": ">=6.9.0"
}
},
"node_modules/@babel/parser": {
"version": "7.27.4",
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.4.tgz",
"integrity": "sha512-BRmLHGwpUqLFR2jzx9orBuX/ABDkj2jLKOXrHDTN2aOKL+jFDDKaRNo9nyYsIl9h/UE/7lMKdDjKQQyxKKDZ7g==",
"version": "7.27.5",
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.5.tgz",
"integrity": "sha512-OsQd175SxWkGlzbny8J3K8TnnDD0N3lrIUtB92xwyRpzaenGZhxDvxN/JgU00U3CDZNj9tPuDJ5H0WS4Nt3vKg==",
"license": "MIT",
"dependencies": {
"@babel/types": "^7.27.3"
@@ -268,9 +268,9 @@
}
},
"node_modules/@babel/runtime": {
"version": "7.27.4",
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.4.tgz",
"integrity": "sha512-t3yaEOuGu9NlIZ+hIeGbBjFtZT7j2cb2tg0fuaJKeGotchRjjLfrBA9Kwf8quhpP1EUuxModQg04q/mBwyg8uA==",
"version": "7.27.6",
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz",
"integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==",
"license": "MIT",
"engines": {
"node": ">=6.9.0"
@@ -309,9 +309,9 @@
}
},
"node_modules/@babel/types": {
"version": "7.27.3",
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.3.tgz",
"integrity": "sha512-Y1GkI4ktrtvmawoSq+4FCVHNryea6uR+qUQy0AGxLSsjCX0nVmkYQMBLHDkXZuo5hGx7eYdnIaslsdBFm7zbUw==",
"version": "7.27.6",
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.6.tgz",
"integrity": "sha512-ETyHEk2VHHvl9b9jZP5IHPavHYk57EhanlRRuae9XCpb/j5bDCbPPMOBfCWhnl/7EDJz0jEMCi/RhccCE8r1+Q==",
"license": "MIT",
"dependencies": {
"@babel/helper-string-parser": "^7.27.1",
@@ -1237,16 +1237,16 @@
}
},
"node_modules/@rolldown/pluginutils": {
"version": "1.0.0-beta.9",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.9.tgz",
"integrity": "sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==",
"version": "1.0.0-beta.11",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz",
"integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==",
"dev": true,
"license": "MIT"
},
"node_modules/@rollup/rollup-android-arm-eabi": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.41.1.tgz",
"integrity": "sha512-NELNvyEWZ6R9QMkiytB4/L4zSEaBC03KIXEghptLGLZWJ6VPrL63ooZQCOnlx36aQPGhzuOMwDerC1Eb2VmrLw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.42.0.tgz",
"integrity": "sha512-gldmAyS9hpj+H6LpRNlcjQWbuKUtb94lodB9uCz71Jm+7BxK1VIOo7y62tZZwxhA7j1ylv/yQz080L5WkS+LoQ==",
"cpu": [
"arm"
],
@@ -1258,9 +1258,9 @@
]
},
"node_modules/@rollup/rollup-android-arm64": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.41.1.tgz",
"integrity": "sha512-DXdQe1BJ6TK47ukAoZLehRHhfKnKg9BjnQYUu9gzhI8Mwa1d2fzxA1aw2JixHVl403bwp1+/o/NhhHtxWJBgEA==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.42.0.tgz",
"integrity": "sha512-bpRipfTgmGFdCZDFLRvIkSNO1/3RGS74aWkJJTFJBH7h3MRV4UijkaEUeOMbi9wxtxYmtAbVcnMtHTPBhLEkaw==",
"cpu": [
"arm64"
],
@@ -1272,9 +1272,9 @@
]
},
"node_modules/@rollup/rollup-darwin-arm64": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.41.1.tgz",
"integrity": "sha512-5afxvwszzdulsU2w8JKWwY8/sJOLPzf0e1bFuvcW5h9zsEg+RQAojdW0ux2zyYAz7R8HvvzKCjLNJhVq965U7w==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.42.0.tgz",
"integrity": "sha512-JxHtA081izPBVCHLKnl6GEA0w3920mlJPLh89NojpU2GsBSB6ypu4erFg/Wx1qbpUbepn0jY4dVWMGZM8gplgA==",
"cpu": [
"arm64"
],
@@ -1286,9 +1286,9 @@
]
},
"node_modules/@rollup/rollup-darwin-x64": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.41.1.tgz",
"integrity": "sha512-egpJACny8QOdHNNMZKf8xY0Is6gIMz+tuqXlusxquWu3F833DcMwmGM7WlvCO9sB3OsPjdC4U0wHw5FabzCGZg==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.42.0.tgz",
"integrity": "sha512-rv5UZaWVIJTDMyQ3dCEK+m0SAn6G7H3PRc2AZmExvbDvtaDc+qXkei0knQWcI3+c9tEs7iL/4I4pTQoPbNL2SA==",
"cpu": [
"x64"
],
@@ -1300,9 +1300,9 @@
]
},
"node_modules/@rollup/rollup-freebsd-arm64": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.41.1.tgz",
"integrity": "sha512-DBVMZH5vbjgRk3r0OzgjS38z+atlupJ7xfKIDJdZZL6sM6wjfDNo64aowcLPKIx7LMQi8vybB56uh1Ftck/Atg==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.42.0.tgz",
"integrity": "sha512-fJcN4uSGPWdpVmvLuMtALUFwCHgb2XiQjuECkHT3lWLZhSQ3MBQ9pq+WoWeJq2PrNxr9rPM1Qx+IjyGj8/c6zQ==",
"cpu": [
"arm64"
],
@@ -1314,9 +1314,9 @@
]
},
"node_modules/@rollup/rollup-freebsd-x64": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.41.1.tgz",
"integrity": "sha512-3FkydeohozEskBxNWEIbPfOE0aqQgB6ttTkJ159uWOFn42VLyfAiyD9UK5mhu+ItWzft60DycIN1Xdgiy8o/SA==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.42.0.tgz",
"integrity": "sha512-CziHfyzpp8hJpCVE/ZdTizw58gr+m7Y2Xq5VOuCSrZR++th2xWAz4Nqk52MoIIrV3JHtVBhbBsJcAxs6NammOQ==",
"cpu": [
"x64"
],
@@ -1328,9 +1328,9 @@
]
},
"node_modules/@rollup/rollup-linux-arm-gnueabihf": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.41.1.tgz",
"integrity": "sha512-wC53ZNDgt0pqx5xCAgNunkTzFE8GTgdZ9EwYGVcg+jEjJdZGtq9xPjDnFgfFozQI/Xm1mh+D9YlYtl+ueswNEg==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.42.0.tgz",
"integrity": "sha512-UsQD5fyLWm2Fe5CDM7VPYAo+UC7+2Px4Y+N3AcPh/LdZu23YcuGPegQly++XEVaC8XUTFVPscl5y5Cl1twEI4A==",
"cpu": [
"arm"
],
@@ -1342,9 +1342,9 @@
]
},
"node_modules/@rollup/rollup-linux-arm-musleabihf": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.41.1.tgz",
"integrity": "sha512-jwKCca1gbZkZLhLRtsrka5N8sFAaxrGz/7wRJ8Wwvq3jug7toO21vWlViihG85ei7uJTpzbXZRcORotE+xyrLA==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.42.0.tgz",
"integrity": "sha512-/i8NIrlgc/+4n1lnoWl1zgH7Uo0XK5xK3EDqVTf38KvyYgCU/Rm04+o1VvvzJZnVS5/cWSd07owkzcVasgfIkQ==",
"cpu": [
"arm"
],
@@ -1356,9 +1356,9 @@
]
},
"node_modules/@rollup/rollup-linux-arm64-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.41.1.tgz",
"integrity": "sha512-g0UBcNknsmmNQ8V2d/zD2P7WWfJKU0F1nu0k5pW4rvdb+BIqMm8ToluW/eeRmxCared5dD76lS04uL4UaNgpNA==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.42.0.tgz",
"integrity": "sha512-eoujJFOvoIBjZEi9hJnXAbWg+Vo1Ov8n/0IKZZcPZ7JhBzxh2A+2NFyeMZIRkY9iwBvSjloKgcvnjTbGKHE44Q==",
"cpu": [
"arm64"
],
@@ -1370,9 +1370,9 @@
]
},
"node_modules/@rollup/rollup-linux-arm64-musl": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.41.1.tgz",
"integrity": "sha512-XZpeGB5TKEZWzIrj7sXr+BEaSgo/ma/kCgrZgL0oo5qdB1JlTzIYQKel/RmhT6vMAvOdM2teYlAaOGJpJ9lahg==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.42.0.tgz",
"integrity": "sha512-/3NrcOWFSR7RQUQIuZQChLND36aTU9IYE4j+TB40VU78S+RA0IiqHR30oSh6P1S9f9/wVOenHQnacs/Byb824g==",
"cpu": [
"arm64"
],
@@ -1384,9 +1384,9 @@
]
},
"node_modules/@rollup/rollup-linux-loongarch64-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.41.1.tgz",
"integrity": "sha512-bkCfDJ4qzWfFRCNt5RVV4DOw6KEgFTUZi2r2RuYhGWC8WhCA8lCAJhDeAmrM/fdiAH54m0mA0Vk2FGRPyzI+tw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.42.0.tgz",
"integrity": "sha512-O8AplvIeavK5ABmZlKBq9/STdZlnQo7Sle0LLhVA7QT+CiGpNVe197/t8Aph9bhJqbDVGCHpY2i7QyfEDDStDg==",
"cpu": [
"loong64"
],
@@ -1398,9 +1398,9 @@
]
},
"node_modules/@rollup/rollup-linux-powerpc64le-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.41.1.tgz",
"integrity": "sha512-3mr3Xm+gvMX+/8EKogIZSIEF0WUu0HL9di+YWlJpO8CQBnoLAEL/roTCxuLncEdgcfJcvA4UMOf+2dnjl4Ut1A==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.42.0.tgz",
"integrity": "sha512-6Qb66tbKVN7VyQrekhEzbHRxXXFFD8QKiFAwX5v9Xt6FiJ3BnCVBuyBxa2fkFGqxOCSGGYNejxd8ht+q5SnmtA==",
"cpu": [
"ppc64"
],
@@ -1412,9 +1412,9 @@
]
},
"node_modules/@rollup/rollup-linux-riscv64-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.41.1.tgz",
"integrity": "sha512-3rwCIh6MQ1LGrvKJitQjZFuQnT2wxfU+ivhNBzmxXTXPllewOF7JR1s2vMX/tWtUYFgphygxjqMl76q4aMotGw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.42.0.tgz",
"integrity": "sha512-KQETDSEBamQFvg/d8jajtRwLNBlGc3aKpaGiP/LvEbnmVUKlFta1vqJqTrvPtsYsfbE/DLg5CC9zyXRX3fnBiA==",
"cpu": [
"riscv64"
],
@@ -1426,9 +1426,9 @@
]
},
"node_modules/@rollup/rollup-linux-riscv64-musl": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.41.1.tgz",
"integrity": "sha512-LdIUOb3gvfmpkgFZuccNa2uYiqtgZAz3PTzjuM5bH3nvuy9ty6RGc/Q0+HDFrHrizJGVpjnTZ1yS5TNNjFlklw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.42.0.tgz",
"integrity": "sha512-qMvnyjcU37sCo/tuC+JqeDKSuukGAd+pVlRl/oyDbkvPJ3awk6G6ua7tyum02O3lI+fio+eM5wsVd66X0jQtxw==",
"cpu": [
"riscv64"
],
@@ -1440,9 +1440,9 @@
]
},
"node_modules/@rollup/rollup-linux-s390x-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.41.1.tgz",
"integrity": "sha512-oIE6M8WC9ma6xYqjvPhzZYk6NbobIURvP/lEbh7FWplcMO6gn7MM2yHKA1eC/GvYwzNKK/1LYgqzdkZ8YFxR8g==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.42.0.tgz",
"integrity": "sha512-I2Y1ZUgTgU2RLddUHXTIgyrdOwljjkmcZ/VilvaEumtS3Fkuhbw4p4hgHc39Ypwvo2o7sBFNl2MquNvGCa55Iw==",
"cpu": [
"s390x"
],
@@ -1454,9 +1454,9 @@
]
},
"node_modules/@rollup/rollup-linux-x64-gnu": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.41.1.tgz",
"integrity": "sha512-cWBOvayNvA+SyeQMp79BHPK8ws6sHSsYnK5zDcsC3Hsxr1dgTABKjMnMslPq1DvZIp6uO7kIWhiGwaTdR4Og9A==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.42.0.tgz",
"integrity": "sha512-Gfm6cV6mj3hCUY8TqWa63DB8Mx3NADoFwiJrMpoZ1uESbK8FQV3LXkhfry+8bOniq9pqY1OdsjFWNsSbfjPugw==",
"cpu": [
"x64"
],
@@ -1468,9 +1468,9 @@
]
},
"node_modules/@rollup/rollup-linux-x64-musl": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.41.1.tgz",
"integrity": "sha512-y5CbN44M+pUCdGDlZFzGGBSKCA4A/J2ZH4edTYSSxFg7ce1Xt3GtydbVKWLlzL+INfFIZAEg1ZV6hh9+QQf9YQ==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.42.0.tgz",
"integrity": "sha512-g86PF8YZ9GRqkdi0VoGlcDUb4rYtQKyTD1IVtxxN4Hpe7YqLBShA7oHMKU6oKTCi3uxwW4VkIGnOaH/El8de3w==",
"cpu": [
"x64"
],
@@ -1482,9 +1482,9 @@
]
},
"node_modules/@rollup/rollup-win32-arm64-msvc": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.41.1.tgz",
"integrity": "sha512-lZkCxIrjlJlMt1dLO/FbpZbzt6J/A8p4DnqzSa4PWqPEUUUnzXLeki/iyPLfV0BmHItlYgHUqJe+3KiyydmiNQ==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.42.0.tgz",
"integrity": "sha512-+axkdyDGSp6hjyzQ5m1pgcvQScfHnMCcsXkx8pTgy/6qBmWVhtRVlgxjWwDp67wEXXUr0x+vD6tp5W4x6V7u1A==",
"cpu": [
"arm64"
],
@@ -1496,9 +1496,9 @@
]
},
"node_modules/@rollup/rollup-win32-ia32-msvc": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.41.1.tgz",
"integrity": "sha512-+psFT9+pIh2iuGsxFYYa/LhS5MFKmuivRsx9iPJWNSGbh2XVEjk90fmpUEjCnILPEPJnikAU6SFDiEUyOv90Pg==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.42.0.tgz",
"integrity": "sha512-F+5J9pelstXKwRSDq92J0TEBXn2nfUrQGg+HK1+Tk7VOL09e0gBqUHugZv7SW4MGrYj41oNCUe3IKCDGVlis2g==",
"cpu": [
"ia32"
],
@@ -1510,9 +1510,9 @@
]
},
"node_modules/@rollup/rollup-win32-x64-msvc": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.41.1.tgz",
"integrity": "sha512-Wq2zpapRYLfi4aKxf2Xff0tN+7slj2d4R87WEzqw7ZLsVvO5zwYCIuEGSZYiK41+GlwUo1HiR+GdkLEJnCKTCw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.42.0.tgz",
"integrity": "sha512-LpHiJRwkaVz/LqjHjK8LCi8osq7elmpwujwbXKNW88bM8eeGxavJIKKjkjpMHAh/2xfnrt1ZSnhTv41WYUHYmA==",
"cpu": [
"x64"
],
@@ -1578,9 +1578,9 @@
}
},
"node_modules/@types/estree": {
"version": "1.0.7",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz",
"integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==",
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
"integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
"license": "MIT"
},
"node_modules/@types/estree-jsx": {
@@ -1623,24 +1623,24 @@
"license": "MIT"
},
"node_modules/@types/prop-types": {
"version": "15.7.14",
"resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz",
"integrity": "sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==",
"version": "15.7.15",
"resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz",
"integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==",
"license": "MIT"
},
"node_modules/@types/react": {
"version": "19.1.6",
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.6.tgz",
"integrity": "sha512-JeG0rEWak0N6Itr6QUx+X60uQmN+5t3j9r/OVDtWzFXKaj6kD1BwJzOksD0FF6iWxZlbE1kB0q9vtnU2ekqa1Q==",
"version": "19.1.7",
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.7.tgz",
"integrity": "sha512-BnsPLV43ddr05N71gaGzyZ5hzkCmGwhMvYc8zmvI8Ci1bRkkDSzDDVfAXfN2tk748OwI7ediiPX6PfT9p0QGVg==",
"license": "MIT",
"dependencies": {
"csstype": "^3.0.2"
}
},
"node_modules/@types/react-dom": {
"version": "19.1.5",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz",
"integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==",
"version": "19.1.6",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.6.tgz",
"integrity": "sha512-4hOiT/dwO8Ko0gV1m/TJZYk3y0KBnY9vzDh7W+DH17b2HFSOGgdj33dhihPeuy3l0q23+4e+hoXHV6hCC4dCXw==",
"dev": true,
"license": "MIT",
"peerDependencies": {
@@ -1669,16 +1669,16 @@
"license": "ISC"
},
"node_modules/@vitejs/plugin-react": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.0.tgz",
"integrity": "sha512-JuLWaEqypaJmOJPLWwO335Ig6jSgC1FTONCWAxnqcQthLTK/Yc9aH6hr9z/87xciejbQcnP3GnA1FWUSWeXaeg==",
"version": "4.5.2",
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.2.tgz",
"integrity": "sha512-QNVT3/Lxx99nMQWJWF7K4N6apUEuT0KlZA3mx/mVaoGj3smm/8rc8ezz15J1pcbcjDK0V15rpHetVfya08r76Q==",
"dev": true,
"license": "MIT",
"dependencies": {
"@babel/core": "^7.26.10",
"@babel/plugin-transform-react-jsx-self": "^7.25.9",
"@babel/plugin-transform-react-jsx-source": "^7.25.9",
"@rolldown/pluginutils": "1.0.0-beta.9",
"@babel/core": "^7.27.4",
"@babel/plugin-transform-react-jsx-self": "^7.27.1",
"@babel/plugin-transform-react-jsx-source": "^7.27.1",
"@rolldown/pluginutils": "1.0.0-beta.11",
"@types/babel__core": "^7.20.5",
"react-refresh": "^0.17.0"
},
@@ -1686,7 +1686,7 @@
"node": "^14.18.0 || >=16.0.0"
},
"peerDependencies": {
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0"
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0"
}
},
"node_modules/babel-plugin-macros": {
@@ -1770,9 +1770,9 @@
}
},
"node_modules/caniuse-lite": {
"version": "1.0.30001720",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001720.tgz",
"integrity": "sha512-Ec/2yV2nNPwb4DnTANEV99ZWwm3ZWfdlfkQbWSDDt+PsXEVYwlhPH8tdMaPunYTKKmz7AnHi2oNEi1GcmKCD8g==",
"version": "1.0.30001721",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001721.tgz",
"integrity": "sha512-cOuvmUVtKrtEaoKiO0rSc29jcjwMwX5tOHDy4MgVFEWiUXj4uBMJkwI8MDySkgXidpMiHUcviogAvFi4pA2hDQ==",
"dev": true,
"funding": [
{
@@ -1959,9 +1959,9 @@
}
},
"node_modules/electron-to-chromium": {
"version": "1.5.161",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.161.tgz",
"integrity": "sha512-hwtetwfKNZo/UlwHIVBlKZVdy7o8bIZxxKs0Mv/ROPiQQQmDgdm5a+KvKtBsxM8ZjFzTaCeLoodZ8jiBE3o9rA==",
"version": "1.5.166",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.166.tgz",
"integrity": "sha512-QPWqHL0BglzPYyJJ1zSSmwFFL6MFXhbACOCcsCdUMCkzPdS9/OIBVxg516X/Ado2qwAq8k0nJJ7phQPCqiaFAw==",
"dev": true,
"license": "ISC"
},
@@ -2054,9 +2054,9 @@
"license": "MIT"
},
"node_modules/fdir": {
"version": "6.4.5",
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.5.tgz",
"integrity": "sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==",
"version": "6.4.6",
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz",
"integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==",
"dev": true,
"license": "MIT",
"peerDependencies": {
@@ -3342,9 +3342,9 @@
}
},
"node_modules/rollup": {
"version": "4.41.1",
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.41.1.tgz",
"integrity": "sha512-cPmwD3FnFv8rKMBc1MxWCwVQFxwf1JEmSX3iQXrRVVG15zerAIXRjMFVWnd5Q5QvgKF7Aj+5ykXFhUl+QGnyOw==",
"version": "4.42.0",
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.42.0.tgz",
"integrity": "sha512-LW+Vse3BJPyGJGAJt1j8pWDKPd73QM8cRXYK1IxOBgL2AGLu7Xd2YOW0M2sLUBCkF5MshXXtMApyEAEzMVMsnw==",
"dev": true,
"license": "MIT",
"dependencies": {
@@ -3358,29 +3358,36 @@
"npm": ">=8.0.0"
},
"optionalDependencies": {
"@rollup/rollup-android-arm-eabi": "4.41.1",
"@rollup/rollup-android-arm64": "4.41.1",
"@rollup/rollup-darwin-arm64": "4.41.1",
"@rollup/rollup-darwin-x64": "4.41.1",
"@rollup/rollup-freebsd-arm64": "4.41.1",
"@rollup/rollup-freebsd-x64": "4.41.1",
"@rollup/rollup-linux-arm-gnueabihf": "4.41.1",
"@rollup/rollup-linux-arm-musleabihf": "4.41.1",
"@rollup/rollup-linux-arm64-gnu": "4.41.1",
"@rollup/rollup-linux-arm64-musl": "4.41.1",
"@rollup/rollup-linux-loongarch64-gnu": "4.41.1",
"@rollup/rollup-linux-powerpc64le-gnu": "4.41.1",
"@rollup/rollup-linux-riscv64-gnu": "4.41.1",
"@rollup/rollup-linux-riscv64-musl": "4.41.1",
"@rollup/rollup-linux-s390x-gnu": "4.41.1",
"@rollup/rollup-linux-x64-gnu": "4.41.1",
"@rollup/rollup-linux-x64-musl": "4.41.1",
"@rollup/rollup-win32-arm64-msvc": "4.41.1",
"@rollup/rollup-win32-ia32-msvc": "4.41.1",
"@rollup/rollup-win32-x64-msvc": "4.41.1",
"@rollup/rollup-android-arm-eabi": "4.42.0",
"@rollup/rollup-android-arm64": "4.42.0",
"@rollup/rollup-darwin-arm64": "4.42.0",
"@rollup/rollup-darwin-x64": "4.42.0",
"@rollup/rollup-freebsd-arm64": "4.42.0",
"@rollup/rollup-freebsd-x64": "4.42.0",
"@rollup/rollup-linux-arm-gnueabihf": "4.42.0",
"@rollup/rollup-linux-arm-musleabihf": "4.42.0",
"@rollup/rollup-linux-arm64-gnu": "4.42.0",
"@rollup/rollup-linux-arm64-musl": "4.42.0",
"@rollup/rollup-linux-loongarch64-gnu": "4.42.0",
"@rollup/rollup-linux-powerpc64le-gnu": "4.42.0",
"@rollup/rollup-linux-riscv64-gnu": "4.42.0",
"@rollup/rollup-linux-riscv64-musl": "4.42.0",
"@rollup/rollup-linux-s390x-gnu": "4.42.0",
"@rollup/rollup-linux-x64-gnu": "4.42.0",
"@rollup/rollup-linux-x64-musl": "4.42.0",
"@rollup/rollup-win32-arm64-msvc": "4.42.0",
"@rollup/rollup-win32-ia32-msvc": "4.42.0",
"@rollup/rollup-win32-x64-msvc": "4.42.0",
"fsevents": "~2.3.2"
}
},
"node_modules/rollup/node_modules/@types/estree": {
"version": "1.0.7",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz",
"integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==",
"dev": true,
"license": "MIT"
},
"node_modules/scheduler": {
"version": "0.26.0",
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz",

View File

@@ -8,6 +8,7 @@ Factory method to initialize an extractor instance based on its name.
from __future__ import annotations
from abc import abstractmethod
from contextlib import suppress
import mimetypes
import os
import requests
@@ -16,6 +17,7 @@ from retrying import retry
import re
from auto_archiver.core import Metadata, BaseModule
from auto_archiver.utils.url import get_media_url_best_quality
class Extractor(BaseModule):
@@ -70,10 +72,20 @@ class Extractor(BaseModule):
return ""
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
def download_from_url(self, url: str, to_filename: str = None, verbose=True, try_best_quality=False) -> str:
"""
downloads a URL to provided filename, or inferred from URL, returns local filename
Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
"""
if try_best_quality:
with suppress(Exception):
# Attempt to download the original URL
best_quality_url = get_media_url_best_quality(url)
orig_download = self.download_from_url(best_quality_url, to_filename, verbose)
if orig_download:
return orig_download, best_quality_url
if not to_filename:
to_filename = url.split("/")[-1].split("?")[0]
if len(to_filename) > 64:
@@ -98,10 +110,12 @@ class Extractor(BaseModule):
with open(to_filename, "wb") as f:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
if try_best_quality:
return to_filename, url
return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
@abstractmethod
def download(self, item: Metadata) -> Metadata | False:

View File

@@ -17,13 +17,14 @@
"default": 50,
"help": "maximum number of videos to download from the page (0 = no download, inf = no limit).",
},
"exclude_media_extensions": {
"default": ".svg,.ico,.gif",
"help": "CSV of media (image/video) file extensions to exclude from download",
},
"user_data_dir": {
"default": "secrets/antibot_user_data",
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.",
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.",
},
"detect_auth_wall": {
"default": True,
"type": "bool",
"help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.",
},
"proxy": {
"default": None,
@@ -31,7 +32,9 @@
},
},
"description": """
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile.
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.
Still in trial development, please report any issues or suggestions via GitHub Issues.
### Features
- Extracts the HTML source code of the page.
@@ -40,7 +43,6 @@
- Downloads images and videos from the page, excluding specified file extensions.
### Notes
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
""",
}

View File

@@ -1,12 +1,10 @@
import base64
import math
import mimetypes
import os
import sys
import traceback
from urllib.parse import urljoin
import glob
import stat
import importlib.util
from loguru import logger
@@ -15,7 +13,9 @@ from seleniumbase import SB
from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.url import is_relevant_url
class AntibotExtractorEnricher(Extractor, Enricher):
@@ -25,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
self.agent = None # Use the default UserAgent
# parse configuration options
self.exclude_media_mimetypes = set(
[mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")]
) - {None}
if self.max_download_images == "inf":
self.max_download_images = math.inf
else:
@@ -39,7 +35,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
else:
self.max_download_videos = int(self.max_download_videos)
self._prepare_and_warn_about_docker_and_user_data_dir()
self._prepare_user_data_dir()
self.dropins = self.load_dropins()
@@ -77,19 +73,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
result.status = "antibot"
return result
def _prepare_and_warn_about_docker_and_user_data_dir(self):
os.makedirs(self.user_data_dir, exist_ok=True)
in_docker = os.environ.get("RUNNING_IN_DOCKER")
if in_docker and self.user_data_dir:
st = os.stat(self.user_data_dir)
perms = stat.filemode(st.st_mode)
owner = st.st_uid
group = st.st_gid
if owner != 0 or group != 0:
logger.warning(
f"""ANTIBOT: Running in Docker with user_data_dir {self.user_data_dir} with permissions {perms} and non-root {owner=}. This may cause issues with Chrome, if you get 'session not created' errors make sure to remove the folder and let docker create it."""
)
def _prepare_user_data_dir(self):
if self.user_data_dir:
in_docker = os.environ.get("RUNNING_IN_DOCKER")
if in_docker:
self.user_data_dir = self.user_data_dir.rstrip(os.path.sep) + "_docker"
os.makedirs(self.user_data_dir, exist_ok=True)
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
using_user_data_dir = self.user_data_dir if custom_data_dir else None
@@ -102,39 +91,41 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.uc_open_with_reconnect(url, 4)
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
# TODO: implement other Captcha handling
sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected
dropin = self._get_suitable_dropin(url, sb)
dropin.open_page(url)
suitable_dropin = self._get_suitable_dropin(url, sb)
if suitable_dropin:
suitable_dropin.open_page(url)
if self._hit_auth_wall(sb):
if self.detect_auth_wall and self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
return False
logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title())
self._enrich_html_source_code(sb, to_enrich)
self._enrich_full_page_screenshot(sb, to_enrich)
if self.save_to_pdf:
self._enrich_full_page_pdf(sb, to_enrich)
downloaded_images, downloaded_videos = 0, 0
if suitable_dropin:
downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich)
downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich)
self._enrich_download_media(
sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images
sb,
to_enrich,
css_selector=dropin.images_selectors(),
max_media=self.max_download_images - downloaded_images,
)
self._enrich_download_media(
sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos
sb,
to_enrich,
css_selector=dropin.video_selectors(),
max_media=self.max_download_videos - downloaded_videos,
)
logger.success(f"ANTIBOT completed for {url_sample}")
logger.info(f"ANTIBOT completed for {url_sample}")
return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e:
@@ -155,10 +146,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
for dropin in self.dropins:
if dropin.suitable(url):
logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}")
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
return dropin(sb, self)
# logger.warning(f"ANTIBOT no suitable drop-in found for {url}")
return None
return DefaultDropin(sb, self)
def _hit_auth_wall(self, sb: SB) -> bool:
"""
@@ -168,8 +159,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# TODO: improve this detection logic, currently it is very basic and may not cover all cases
# Common URL patterns
url = sb.get_current_url().lower()
if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]):
current_url = sb.get_current_url().lower()
if any(kw in current_url for kw in ["login", "signin", "signup", "register", "captcha"]):
return True
# Common visible text markers
@@ -245,8 +236,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
Enriches the full page screenshot of the Metadata object.
This method is called by the enrich method.
"""
x = sb.execute_script("return document.documentElement.scrollWidth")
y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000)
start_size = sb.get_window_size()
w, h = start_size["width"], start_size["height"]
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -278,12 +273,9 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
if max_media == 0:
return
logger.debug(
f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items."
)
url = to_enrich.get_url()
all_urls = set()
# media_elements = sb.find_elements(css_selector)
sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href)
@@ -293,10 +285,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")
break
mimerype = mimetypes.guess_type(src)[0]
if mimerype in self.exclude_media_mimetypes:
if not is_relevant_url(src):
continue
full_src = urljoin(url, src)
if full_src not in all_urls and (filename := self.download_from_url(full_src)):
if full_src not in all_urls:
filename, full_src = self.download_from_url(full_src, try_best_quality=True)
if not filename:
continue
all_urls.add(full_src)
to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))

View File

@@ -1,7 +1,10 @@
import os
from loguru import logger
from seleniumbase import SB
import yt_dlp
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.core import Extractor, Media, Metadata
from auto_archiver.utils.misc import ydl_entry_to_filename
class Dropin:
@@ -36,6 +39,20 @@ class Dropin:
"""
return url
@staticmethod
def images_selectors() -> str:
"""
CSS selector to find images in the HTML page
"""
return "img"
@staticmethod
def video_selectors() -> str:
"""
CSS selector to find videos in the HTML page.
"""
return "video, source"
def open_page(self, url) -> bool:
"""
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -50,3 +67,59 @@ class Dropin:
:return: A tuple (number of Images added, number of Videos added).
"""
raise NotImplementedError("This method should be implemented in the subclass")
def _get_username_password(self, site) -> tuple[str, str]:
"""
Get the username and password for the site from the extractor's auth data.
:return: A tuple (username, password).
"""
auth = self.extractor.auth_for_site(site)
username = auth.get("username", "")
password = auth.get("password", "")
if not username or not password:
raise ValueError(f"{site} authentication requires a username and password.")
return username, password
def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int:
    """
    Download videos with yt-dlp and attach each one to `to_enrich` as a Media item.

    :param video_urls: List of video URLs to download; truncated to the extractor's
        `max_download_videos` when that setting is an int.
    :param to_enrich: Metadata object that receives one Media per downloaded video.
    :return: The number of videos downloaded.
    """
    limit = self.extractor.max_download_videos
    if type(limit) is int:
        video_urls = video_urls[:limit]
    if not video_urls:
        return 0

    # files are written to the extractor's temporary directory
    output_template = os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s")
    cli_args = [
        "-o",
        output_template,
        "--quiet",
        "--no-playlist",
        "--no-write-subs",
        "--no-write-auto-subs",
        "--postprocessor-args",
        "ffmpeg:-bitexact",
        "--max-filesize",
        "1000M",  # Limit to 1GB per video
    ]
    # only the last element of the parse result is used to build the downloader
    *_, ydl_opts = yt_dlp.parse_options(cli_args)

    copied_fields = ("duration", "original_url", "fulltitle", "description", "upload_date")
    count = 0
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        for url in video_urls:
            try:
                logger.debug(f"Downloading video from URL: {url}")
                info = ydl.extract_info(url, download=True)
                local_file = ydl_entry_to_filename(ydl, info)
                if local_file:  # an empty filename means the download failed
                    media = Media(local_file)
                    for field in copied_fields:
                        if field in info:
                            media.set(field, info[field])
                    to_enrich.add_media(media)
                    count += 1
            except Exception as e:
                # one failing URL must not stop the remaining downloads
                logger.error(f"Error downloading {url}: {e}")
    return count

View File

@@ -0,0 +1,18 @@
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
class DefaultDropin(Dropin):
    """
    Default fallback drop-in for generic pages in the antibot extractor enricher module.

    It performs no site-specific work: it never reports itself as suitable for a URL,
    treats the page as already open, and contributes no extra media.
    """

    @staticmethod
    def suitable(url: str) -> bool:
        """Always False: this drop-in never claims a URL for itself."""
        return False

    def open_page(self, url) -> bool:
        """No extra navigation, login or captcha handling is performed; always returns True."""
        return True

    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
        """
        Adds nothing to `to_enrich`.

        :return: A tuple (number of Images added, number of Videos added), always (0, 0).
        """
        return (0, 0)

View File

@@ -0,0 +1,78 @@
from contextlib import suppress
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from loguru import logger
class RedditDropin(Dropin):
    """
    A class to handle Reddit drop-in functionality for the antibot extractor enricher module.

    It restricts media discovery to elements inside `shreddit-post`, logs in when Reddit
    shows its "blocked by network security" page, and hands streaming-manifest video URLs
    (.m3u8/.mpd/.ism) to yt-dlp.
    """

    @staticmethod
    def suitable(url: str) -> bool:
        """Return True when the URL contains `reddit.com`."""
        return "reddit.com" in url

    @staticmethod
    def images_selectors() -> str:
        """CSS selector for images, limited to those inside the post element."""
        return "shreddit-post img"

    @staticmethod
    def video_selectors() -> str:
        """CSS selector for videos and their sources, limited to those inside the post element."""
        return "shreddit-post video, shreddit-post source"

    def open_page(self, url) -> bool:
        """
        Make sure `url` is the page currently open in the browser.

        Attempts a login first when Reddit's block page is visible, then re-opens `url`
        if the browser ended up on a different address.

        :return: always True.
        """
        if self.sb.is_text_visible("You've been blocked by network security."):
            self._login()
        if url != self.sb.get_current_url():
            self.sb.open(url)
        return True

    @logger.catch
    def _login(self):
        """
        Log in to Reddit through the login form, using the credentials configured for
        `reddit.com`. Any exception is logged by `logger.catch` rather than propagated.
        """
        self.sb.click_link_text("Log in")
        self.sb.wait_for_ready_state_complete()
        self._close_cookies_banner()

        username, password = self._get_username_password("reddit.com")
        logger.debug("RedditDropin Logging in to Reddit with username: {}", username)

        self.sb.type("#login-username", username)
        self.sb.type("#login-password", password)
        # scroll the submit button into view before clicking it
        elem = self.sb.find_element("button.login")
        self.sb.execute_script("arguments[0].scrollIntoView(true);", elem)
        self.sb.slow_click("button.login")
        self.sb.wait_for_ready_state_complete()

        # still on the login page: give the redirect some extra time
        if "https://www.reddit.com/login/" in self.sb.get_current_url():
            self.sb.sleep(5)
            self.sb.wait_for_ready_state_complete()
        # the block page may be shown again after submitting; follow its login link once more
        if self.sb.is_text_visible("You've been blocked by network security."):
            self.sb.click_link_text("Log in")
            self.sb.wait_for_ready_state_complete()
        if self.sb.is_text_visible("Welcome back"):
            logger.debug("RedditDropin Login successful")
            # NOTE(review): presumably a link on the welcome page that continues to the site - confirm
            self.sb.click_if_visible("this link")

    def _close_cookies_banner(self):
        """Best-effort click on the cookie banner's accept button; failures are ignored."""
        with suppress(Exception):  # selenium.common.exceptions.JavascriptException
            self.sb.execute_script("""
                document
                    .querySelector("reddit-cookie-banner")
                    .shadowRoot.querySelector("faceplate-dialog")
                    .querySelector("#accept-all-cookies-button button")
                    .click()
            """)

    # default=(0, 0): callers unpack the result, so a caught exception must not yield None
    @logger.catch(default=(0, 0))
    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
        """
        Download the post's streaming videos with yt-dlp.

        Collects the `src`/`href` of the elements matched by `video_selectors()` and keeps
        those whose path ends in a manifest extension (.m3u8, .mpd, .ism), whether or not
        a query string or fragment follows it.

        :return: A tuple (number of Images added, number of Videos added); images is always 0.
        """
        filtered_urls = self.sb.execute_script(rf"""
            return [...document.querySelectorAll("{self.video_selectors()}")]
                .map(el => el.src || el.href)
                .filter(url => url && /\.(m3u8|mpd|ism)(?:[?#]|$)/.test(url));
        """)
        logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
        return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)

View File

@@ -1,12 +1,8 @@
import os
import re
from auto_archiver.core.media import Media
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.utils.misc import ydl_entry_to_filename
import yt_dlp
from loguru import logger
@@ -37,10 +33,11 @@ class VkDropin(Dropin):
def open_page(self, url) -> bool:
if self.sb.is_text_visible("Sign in to VK"):
self._login()
self.sb.open(url)
if self._login():
self.sb.open(url)
return True
@logger.catch
def _login(self) -> bool:
# TODO: test method
self.sb.open("https://vk.com")
@@ -50,13 +47,9 @@ class VkDropin(Dropin):
return True
# need to login
logger.debug("Logging in to VK...")
auth = self.extractor.auth_for_site("vk.com")
username = auth.get("username", "")
password = auth.get("password", "")
if not username or not password:
raise ValueError("VK authentication requires a username and password.")
logger.debug("Using username: {}", username)
username, password = self._get_username_password("vk.com")
logger.debug("Logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10)
@@ -80,47 +73,6 @@ class VkDropin(Dropin):
@logger.catch
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
"""
Extract video data from the currently open post with SeleniumBase.
:return: A tuple (number of Images added, number of Videos added).
"""
video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
if type(self.extractor.max_download_videos) is int:
video_urls = video_urls[: self.extractor.max_download_videos]
if not video_urls:
return 0, 0
logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.")
ydl_options = [
"-o",
os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
"--quiet",
"--no-playlist",
"--no-write-subs",
"--no-write-auto-subs",
"--postprocessor-args",
"ffmpeg:-bitexact",
"--max-filesize",
"1000M", # Limit to 1GB per video
]
*_, validated_options = yt_dlp.parse_options(ydl_options)
downloaded = 0
with yt_dlp.YoutubeDL(validated_options) as ydl:
for url in video_urls:
try:
logger.debug(f"Downloading video from URL: {url}")
info = ydl.extract_info(url, download=True)
filename = ydl_entry_to_filename(ydl, info)
if not filename: # Failed to download video.
continue
media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
if x in info:
media.set(x, info[x])
to_enrich.add_media(media)
downloaded += 1
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
return 0, downloaded
return 0, self._download_videos_with_ytdlp(video_urls, to_enrich)

View File

@@ -30,6 +30,8 @@ For a full list of video platforms supported by `yt-dlp`, see the
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
You can see all currently implemented dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/generic_extractor).
### Auto-Updates
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).

View File

@@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str:
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8.
"""
clean_url = remove_get_parameters(url)
@@ -104,11 +106,19 @@ def is_relevant_url(url: str) -> bool:
("vk.com/images/reaction/",),
# wikipedia
("wikipedia.org/static",),
# reddit
("styles.redditmedia.com",), # opinionated but excludes many irrelevant images like avatars and banners
("emoji.redditmedia.com",),
]
# TODO: make these globally configurable
IRRELEVANT_ENDS_WITH = [
".svg", # ignore SVGs
".ico", # ignore icons
# ignore index files for videos, these should be handled by ytdlp
".m3u8",
".mpd",
".ism",
]
for end in IRRELEVANT_ENDS_WITH:
@@ -125,6 +135,36 @@ def is_relevant_url(url: str) -> bool:
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
this returns the URL pointing to the highest (original) quality (with 'name=orig')
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)
parsed = urlparse(url)
query = parsed.query
if "name=" in query:
# Replace only the first occurrence of name=xxx with name=orig
new_query = re.sub(r"name=[^&]*", "name=orig", query, 1)
parsed = parsed._replace(query=new_query)
return urlunparse(parsed)
return url
def get_media_url_best_quality(url: str) -> str:
    """
    Returns the best quality URL for the given media URL, it may not exist.

    Two rewrites are applied:
    - Twitter/X media hosts (twitter.com, twimg.com, x.com and their subdomains) are
      passed through `twitter_best_quality_url`.
    - a `-<width>x<height>` suffix right before the file extension is dropped, e.g.
      https://example.com/media-1280x720.mp4 becomes https://example.com/media.mp4.

    :param url: the media URL as found on the page.
    :return: the candidate best-quality URL, or `url` unchanged when no rule applies.
    """
    parsed = urlparse(url)

    # twitter case: compare whole domain labels, a plain substring test would also
    # match unrelated hosts that merely end in the same letters (dropbox.com, netflix.com)
    host = (parsed.hostname or "").lower()
    twitter_domains = ("twitter.com", "twimg.com", "x.com")
    if any(host == domain or host.endswith("." + domain) for domain in twitter_domains):
        url = twitter_best_quality_url(url)
        parsed = urlparse(url)

    # some cases https://example.com/media-1280x720.mp4 to https://example.com/media.mp4
    path_parts = parsed.path.split("/")
    match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", path_parts[-1])
    if match:
        path_parts[-1] = match.group(1) + match.group(2)
        parsed = parsed._replace(path="/".join(path_parts))  # keep the query unchanged
        url = urlunparse(parsed)

    return url

View File

@@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
"save_to_pdf": False,
"max_download_images": 0,
"max_download_videos": 0,
"exclude_media_extensions": ".svg,.ico,.gif",
"proxy": None,
}
@@ -129,15 +128,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
),
(
"https://seleniumbase.io/apps/turnstile",
'id="captcha-success"',
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" style="" width="180">',
),
(
"https://seleniumbase.io/apps/form_turnstile",
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" width="120" style="">',
),
(
"https://gitlab.com/users/sign_in",
"Password",
),
],
)
def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
"""
Test downloading a page with Cloudflare Turnstile captcha.
"""
self.extractor = setup_module(
self.extractor_module,
{
"save_to_pdf": True,
"detect_auth_wall": False,
"max_download_images": 5,
"max_download_videos": "inf",
},
)
item = make_item(url)
self.extractor.enrich(item)

View File

@@ -6,6 +6,7 @@ from auto_archiver.utils.url import (
is_relevant_url,
remove_get_parameters,
twitter_best_quality_url,
get_media_url_best_quality,
)
@@ -95,6 +96,11 @@ def test_remove_get_parameters(url, without_get):
("https://example.com/150x150.jpg", True),
("https://example.com/rsrc.php/", True),
("https://example.com/img/emoji/", True),
("https://styles.redditmedia.com/123", False),
("https://emoji.redditmedia.com/abc.jpg", False),
("https://example.com/rsrc.m3u8?asdasd=10", False),
("https://example.com/rsrc.mpd", False),
("https://example.com/rsrc.ism?vid=12", False),
],
)
def test_is_relevant_url(url, relevant):
@@ -104,10 +110,51 @@ def test_is_relevant_url(url, relevant):
@pytest.mark.parametrize(
    "url, best_quality",
    [
        # a lower-quality `name` value is replaced by `orig`
        pytest.param(
            "https://twitter.com/some_image.jpg?name=small",
            "https://twitter.com/some_image.jpg?name=orig",
        ),
        # parameters following `name` survive the rewrite
        pytest.param(
            "https://twitter.com/some_image.jpg?name=small&this_is_another=145",
            "https://twitter.com/some_image.jpg?name=orig&this_is_another=145",
        ),
        # without a `name` parameter the URL is returned as-is
        pytest.param(
            "https://twitter.com/some_image.jpg",
            "https://twitter.com/some_image.jpg",
        ),
        # a URL that already asks for `orig` is returned as-is
        pytest.param(
            "https://twitter.com/some_image.jpg?name=orig",
            "https://twitter.com/some_image.jpg?name=orig",
        ),
    ],
)
def test_twitter_best_quality_url(url, best_quality):
    """twitter_best_quality_url maps each input URL to its expected `name=orig` form."""
    upgraded = twitter_best_quality_url(url)
    assert upgraded == best_quality
@pytest.mark.parametrize(
    "input_url,expected_url",
    [
        # Twitter media host: an existing name= value becomes name=orig
        pytest.param(
            "https://pbs.twimg.com/media/abc123?format=jpg&name=small",
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
        ),
        pytest.param(
            "https://pbs.twimg.com/media/abc123?name=large",
            "https://pbs.twimg.com/media/abc123?name=orig",
        ),
        # Twitter media host without name=: nothing is added
        pytest.param(
            "https://pbs.twimg.com/media/abc123?format=jpg",
            "https://pbs.twimg.com/media/abc123?format=jpg",
        ),
        # Twitter media host already at name=orig
        pytest.param(
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
        ),
        # x.com is handled like twitter.com
        pytest.param(
            "https://x.com/media/abc123?name=medium",
            "https://x.com/media/abc123?name=orig",
        ),
        # bare twimg.com (no subdomain)
        pytest.param(
            "https://twimg.com/media/abc123?name=thumb",
            "https://twimg.com/media/abc123?name=orig",
        ),
        # unrelated host with nothing to rewrite
        pytest.param(
            "https://example.com/media/file.mp4",
            "https://example.com/media/file.mp4",
        ),
        # -WxH suffix is dropped from the file name
        pytest.param(
            "https://example.com/media/file-1280x720.mp4",
            "https://example.com/media/file.mp4",
        ),
        # ... and the query string is kept
        pytest.param(
            "https://example.com/media/file-1920x1080.jpg?foo=bar",
            "https://example.com/media/file.jpg?foo=bar",
        ),
        # Twitter rewrite and -WxH removal combined
        pytest.param(
            "https://pbs.twimg.com/media/abc-1280x720.jpg?name=small",
            "https://pbs.twimg.com/media/abc.jpg?name=orig",
        ),
        # no -WxH suffix: URL with a query is left alone
        pytest.param(
            "https://example.com/media/file.mp4?foo=bar",
            "https://example.com/media/file.mp4?foo=bar",
        ),
        # nested directories are preserved
        pytest.param(
            "https://example.com/a/b/c/file-640x480.png",
            "https://example.com/a/b/c/file.png",
        ),
        # -WxH on a directory name, not the file name: left alone
        pytest.param(
            "https://example.com/media-1280x720/file.mp4",
            "https://example.com/media-1280x720/file.mp4",
        ),
    ],
)
def test_get_media_url_best_quality(input_url, expected_url):
    """get_media_url_best_quality maps each input URL to its expected best-quality form."""
    best = get_media_url_best_quality(input_url)
    assert best == expected_url