diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 34e7a24..0042295 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,7 +12,7 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "github-actions" directory: "/" @@ -21,7 +21,7 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "npm" directory: "/scripts/settings/" @@ -30,11 +30,11 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "docker" # Look for a `Dockerfile` in the `root` directory directory: "/" # Check for updates once a week schedule: - interval: "weekly" \ No newline at end of file + interval: "monthly" \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index fc92ce3..9f50452 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.6.1 AS base +FROM webrecorder/browsertrix-crawler:1.6.2 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ diff --git a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md new file mode 100644 index 0000000..81e00e2 --- /dev/null +++ b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md @@ -0,0 +1,44 @@ +# Upgrading from v1.0.1 + +```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.1.0). + +If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you. +``` + +Versions 1.1.0+ of Auto Archiver have breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications. + + +## Dropping `vk_extractor` module +We have dropped the `vk_extractor` because of problems in a project we relied on. 
You will need to remove it from your configuration file; otherwise, you will see an error like: + +```{code} console +Module 'vk_extractor' not found. Are you sure it's installed/exists? +``` + +## New `antibot_extractor_enricher` module and VkDropin +We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this: + +```{code} yaml +steps: + extractors: + - antibot_extractor_enricher + + # or alternatively, if you want to use it as an enricher: + enrichers: + - antibot_extractor_enricher +``` + +It comes with Dropins that we will be adding and maintaining. + +> Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add Dropins to your configuration steps for them to run. Sometimes they need `authentication` configurations though. + +One such Dropin is the VkDropin, which uses this automated browser to access VKontakte (VK) pages. You should add a username/password to the configuration file if you get authentication blocks from VK; to do so, use the [authentication settings](authentication_how_to.md): + +```{code} yaml +authentication: + vk: + username: your_username + password: your_password +``` + +See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin. 
\ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 8adf6b9..786ae5f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -193,18 +193,18 @@ files = [ [[package]] name = "boto3" -version = "1.38.27" +version = "1.38.33" description = "The AWS SDK for Python" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "boto3-1.38.27-py3-none-any.whl", hash = "sha256:95f5fe688795303a8a15e8b7e7f255cadab35eae459d00cc281a4fd77252ea80"}, - {file = "boto3-1.38.27.tar.gz", hash = "sha256:94bd7fdd92d5701b362d4df100d21e28f8307a67ff56b6a8b0398119cf22f859"}, + {file = "boto3-1.38.33-py3-none-any.whl", hash = "sha256:25d0717489c658f7ae6c3c7f0f7e1b4d611b30b2f08f0fcef6455ac6864a8768"}, + {file = "boto3-1.38.33.tar.gz", hash = "sha256:6467909c1ae01ff67981f021bb2568592211765ec8a9a1d2c4529191e46c3541"}, ] [package.dependencies] -botocore = ">=1.38.27,<1.39.0" +botocore = ">=1.38.33,<1.39.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.13.0,<0.14.0" @@ -213,14 +213,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.38.27" +version = "1.38.33" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "botocore-1.38.27-py3-none-any.whl", hash = "sha256:a785d5e9a5eda88ad6ab9ed8b87d1f2ac409d0226bba6ff801c55359e94d91a8"}, - {file = "botocore-1.38.27.tar.gz", hash = "sha256:9788f7efe974328a38cbade64cc0b1e67d27944b899f88cb786ae362973133b6"}, + {file = "botocore-1.38.33-py3-none-any.whl", hash = "sha256:ad25233e93dcebe95809b1f9393c1f11a639696327793d166295fb78dd7bc597"}, + {file = "botocore-1.38.33.tar.gz", hash = "sha256:dbe8fea9d0426c644c89ef2018ead886ccbcc22901a02b377b4e65ce1cb69a2b"}, ] [package.dependencies] @@ -941,14 +941,14 @@ files = [ [[package]] name = "google-api-core" -version = "2.24.2" +version = "2.25.0" description = "Google API client core library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_core-2.24.2-py3-none-any.whl", hash = "sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9"}, - {file = "google_api_core-2.24.2.tar.gz", hash = "sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696"}, + {file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"}, + {file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"}, ] [package.dependencies] @@ -959,21 +959,21 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 requests = ">=2.18.0,<3.0.0" [package.extras] -async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] -grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] +async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.0)"] +grpc = ["grpcio (>=1.33.2,<2.0.0)", "grpcio (>=1.49.1,<2.0.0) ; 
python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.0)", "grpcio-status (>=1.49.1,<2.0.0) ; python_version >= \"3.11\""] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] [[package]] name = "google-api-python-client" -version = "2.170.0" +version = "2.171.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.170.0-py3-none-any.whl", hash = "sha256:7bf518a0527ad23322f070fa69f4f24053170d5c766821dc970ff0571ec22748"}, - {file = "google_api_python_client-2.170.0.tar.gz", hash = "sha256:75f3a1856f11418ea3723214e0abc59d9b217fd7ed43dcf743aab7f06ab9e2b1"}, + {file = "google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9"}, + {file = "google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23"}, ] [package.dependencies] @@ -985,14 +985,14 @@ uritemplate = ">=3.0.1,<5" [[package]] name = "google-auth" -version = "2.40.2" +version = "2.40.3" description = "Google Authentication Library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_auth-2.40.2-py2.py3-none-any.whl", hash = "sha256:f7e568d42eedfded58734f6a60c58321896a621f7c116c411550a4b4a13da90b"}, - {file = "google_auth-2.40.2.tar.gz", hash = "sha256:a33cde547a2134273226fa4b853883559947ebe9207521f7afc707efbf690f58"}, + {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, + {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, ] [package.dependencies] @@ -2130,7 +2130,7 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" -groups = ["main", "docs"] +groups = ["main", "dev", "docs"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -2335,26 +2335,27 @@ files = [ [[package]] name = "pytest" -version = "8.3.5" +version = "8.4.0" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main", "dev"] files = [ - {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, - {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, + {file = "pytest-8.4.0-py3-none-any.whl", hash = "sha256:f40f825768ad76c0977cbacdf1fd37c6f7a468e460ea6a0636078f8972d4517e"}, + {file = "pytest-8.4.0.tar.gz", hash = "sha256:14d920b48472ea0dbf68e45b96cd1ffda4705f33307dcc86c676c1b5104838a6"}, ] [package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} +iniconfig = ">=1" +packaging = ">=20" pluggy = ">=1.5,<2" +pygments = ">=2.7.2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-html" @@ -2766,19 +2767,19 @@ files = [ [[package]] name = "requests" -version = "2.32.3" +version = 
"2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main", "docs"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""} urllib3 = ">=1.21.1,<3" @@ -2894,7 +2895,7 @@ description = "Manipulate well-formed Roman numerals" optional = false python-versions = ">=3.9" groups = ["docs"] -markers = "python_version >= \"3.11\"" +markers = "python_version != \"3.10\"" files = [ {file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"}, {file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"}, @@ -2921,14 +2922,14 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruamel-yaml" -version = "0.18.12" +version = "0.18.14" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "ruamel.yaml-0.18.12-py3-none-any.whl", hash = "sha256:790ba4c48b6a6e6b12b532a7308779eb12d2aaab3a80fdb8389216f28ea2b287"}, - {file = "ruamel.yaml-0.18.12.tar.gz", hash = "sha256:5a38fd5ce39d223bebb9e3a6779e86b9427a03fb0bf9f270060f8b149cffe5e2"}, + {file = 
"ruamel.yaml-0.18.14-py3-none-any.whl", hash = "sha256:710ff198bb53da66718c7db27eec4fbcc9aa6ca7204e4c1df2f282b6fe5eb6b2"}, + {file = "ruamel.yaml-0.18.14.tar.gz", hash = "sha256:7227b76aaec364df15936730efbf7d72b30c0b79b1d578bbb8e3dcb2d81f52b7"}, ] [package.dependencies] @@ -3112,14 +3113,14 @@ websocket-client = ">=1.8.0,<1.9.0" [[package]] name = "seleniumbase" -version = "4.39.2" +version = "4.39.3" description = "A complete web automation framework for end-to-end testing." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"}, - {file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"}, + {file = "seleniumbase-4.39.3-py3-none-any.whl", hash = "sha256:cbb94d7858a9ef3b0b4431a5879150649f4a73029afaa8ecfb7bda113f2565e1"}, + {file = "seleniumbase-4.39.3.tar.gz", hash = "sha256:b32978e685b1e4e2c7859b2dcb377ac14ba99bf748ea428548f9e450257b861b"}, ] [package.dependencies] @@ -3156,7 +3157,7 @@ pygments = ">=2.19.1" pynose = ">=1.5.4" pyotp = "2.9.0" pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""} -pytest = "8.3.5" +pytest = {version = "8.4.0", markers = "python_version >= \"3.9\""} pytest-html = "4.0.2" pytest-metadata = "3.1.1" pytest-ordering = "0.6" @@ -3164,11 +3165,11 @@ pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""} pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""} python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""} pyyaml = ">=6.0.2" -requests = "2.32.3" +requests = "2.32.4" rich = ">=14.0.0,<15" sbvirtualdisplay = ">=1.4.0" selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""} -setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""} +setuptools = {version = ">=80.9.0", markers = "python_version >= \"3.10\""} 
six = ">=1.17.0" sniffio = "1.3.1" sortedcontainers = "2.4.0" @@ -3323,7 +3324,7 @@ description = "Python documentation generator" optional = false python-versions = ">=3.11" groups = ["docs"] -markers = "python_version >= \"3.11\"" +markers = "python_version != \"3.10\"" files = [ {file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"}, {file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"}, @@ -3801,14 +3802,14 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "uritemplate" -version = "4.1.1" +version = "4.2.0" description = "Implementation of RFC 6570 URI Templates" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"}, - {file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"}, + {file = "uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686"}, + {file = "uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e"}, ] [[package]] @@ -4120,14 +4121,14 @@ h11 = ">=0.9.0,<1" [[package]] name = "yt-dlp" -version = "2025.5.22" +version = "2025.6.9" description = "A feature-rich command-line audio/video downloader" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "yt_dlp-2025.5.22-py3-none-any.whl", hash = "sha256:a49c4b76afeaded6254c3e2b759d8d5a13271aa963d5fccb51fe059d1c313151"}, - {file = "yt_dlp-2025.5.22.tar.gz", hash = "sha256:ea73854c5dabc124f29a35a8fae9bc5d422ef3231bebeea2bdfa82ac191a9c29"}, + {file = "yt_dlp-2025.6.9-py3-none-any.whl", hash = "sha256:ebdfda9ffa807f6a26aed7c8f906e5557cd06b4c388dc547df1ec2078631fca8"}, + {file = 
"yt_dlp-2025.6.9.tar.gz", hash = "sha256:751f53a3b61353522bf805fa30bbcbd16666126537e39706eab4f8c368f111ac"}, ] [package.dependencies] @@ -4142,7 +4143,7 @@ urllib3 = {version = ">=1.26.17,<3", optional = true, markers = "extra == \"defa websockets = {version = ">=13.0", optional = true, markers = "extra == \"default\""} [package.extras] -build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"] +build = ["build", "hatchling", "pip", "setuptools (>=71.0.2,<81)", "wheel"] curl-cffi = ["curl-cffi (>=0.5.10,<0.6.dev0 || ==0.10.*) ; implementation_name == \"cpython\""] default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"] dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.11.0,<0.12.0)"] diff --git a/pyproject.toml b/pyproject.toml index 466d090..cdbb86b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.0.1" +version = "1.1.0" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." 
requires-python = ">=3.10,<3.13" diff --git a/scripts/settings/package-lock.json b/scripts/settings/package-lock.json index a60d74e..4aced9a 100644 --- a/scripts/settings/package-lock.json +++ b/scripts/settings/package-lock.json @@ -10,21 +10,21 @@ "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", - "@emotion/react": "*", - "@emotion/styled": "*", + "@emotion/react": "latest", + "@emotion/styled": "latest", "@mui/icons-material": "^7.1.1", - "@mui/material": "*", + "@mui/material": "latest", "react": "19.1.0", "react-dom": "19.1.0", "react-markdown": "^10.0.0", "yaml": "^2.7.0" }, "devDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "@vitejs/plugin-react": "*", - "typescript": "*", - "vite": "*", + "@types/react": "latest", + "@types/react-dom": "latest", + "@vitejs/plugin-react": "latest", + "typescript": "latest", + "vite": "latest", "vite-plugin-singlefile": "^2.1.0" } }, @@ -57,9 +57,9 @@ } }, "node_modules/@babel/compat-data": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.3.tgz", - "integrity": "sha512-V42wFfx1ymFte+ecf6iXghnnP8kWTO+ZLXIyZq+1LAXHHvTZdVxicn4yiVYdYMGaCO3tmqub11AorKkv+iodqw==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.5.tgz", + "integrity": "sha512-KiRAp/VoJaWkkte84TvUd9qjdbZAdiqyvMxrGl1N6vzFogKmaLgoM3L1kgtLicp2HP5fBJS8JrZKLVIZGVJAVg==", "dev": true, "license": "MIT", "engines": { @@ -105,12 +105,12 @@ "license": "MIT" }, "node_modules/@babel/generator": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.3.tgz", - "integrity": "sha512-xnlJYj5zepml8NXtjkG0WquFUv8RskFqyFcVgTBp5k+NaA/8uw/K+OSVf8AMGw5e9HKP2ETd5xpK5MLZQD6b4Q==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.5.tgz", + "integrity": 
"sha512-ZGhA37l0e/g2s1Cnzdix0O3aLYm66eF8aufiVteOgnwxgnRP8GoyMj7VWsgWnQbVKXyge7hqrFh2K2TQM6t1Hw==", "license": "MIT", "dependencies": { - "@babel/parser": "^7.27.3", + "@babel/parser": "^7.27.5", "@babel/types": "^7.27.3", "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.25", @@ -207,23 +207,23 @@ } }, "node_modules/@babel/helpers": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.4.tgz", - "integrity": "sha512-Y+bO6U+I7ZKaM5G5rDUZiYfUvQPUibYmAFe7EnKdnKBbVXDZxvp+MWOH5gYciY0EPk4EScsuFMQBbEfpdRKSCQ==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.6.tgz", + "integrity": "sha512-muE8Tt8M22638HU31A3CgfSUciwz1fhATfoVai05aPXGor//CdWDCbnlY1yvBPo07njuVOCNGCSp/GTt12lIug==", "dev": true, "license": "MIT", "dependencies": { "@babel/template": "^7.27.2", - "@babel/types": "^7.27.3" + "@babel/types": "^7.27.6" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/parser": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.4.tgz", - "integrity": "sha512-BRmLHGwpUqLFR2jzx9orBuX/ABDkj2jLKOXrHDTN2aOKL+jFDDKaRNo9nyYsIl9h/UE/7lMKdDjKQQyxKKDZ7g==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.5.tgz", + "integrity": "sha512-OsQd175SxWkGlzbny8J3K8TnnDD0N3lrIUtB92xwyRpzaenGZhxDvxN/JgU00U3CDZNj9tPuDJ5H0WS4Nt3vKg==", "license": "MIT", "dependencies": { "@babel/types": "^7.27.3" @@ -268,9 +268,9 @@ } }, "node_modules/@babel/runtime": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.4.tgz", - "integrity": "sha512-t3yaEOuGu9NlIZ+hIeGbBjFtZT7j2cb2tg0fuaJKeGotchRjjLfrBA9Kwf8quhpP1EUuxModQg04q/mBwyg8uA==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz", + "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==", "license": 
"MIT", "engines": { "node": ">=6.9.0" @@ -309,9 +309,9 @@ } }, "node_modules/@babel/types": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.3.tgz", - "integrity": "sha512-Y1GkI4ktrtvmawoSq+4FCVHNryea6uR+qUQy0AGxLSsjCX0nVmkYQMBLHDkXZuo5hGx7eYdnIaslsdBFm7zbUw==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.6.tgz", + "integrity": "sha512-ETyHEk2VHHvl9b9jZP5IHPavHYk57EhanlRRuae9XCpb/j5bDCbPPMOBfCWhnl/7EDJz0jEMCi/RhccCE8r1+Q==", "license": "MIT", "dependencies": { "@babel/helper-string-parser": "^7.27.1", @@ -1237,16 +1237,16 @@ } }, "node_modules/@rolldown/pluginutils": { - "version": "1.0.0-beta.9", - "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.9.tgz", - "integrity": "sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==", + "version": "1.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz", + "integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==", "dev": true, "license": "MIT" }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.41.1.tgz", - "integrity": "sha512-NELNvyEWZ6R9QMkiytB4/L4zSEaBC03KIXEghptLGLZWJ6VPrL63ooZQCOnlx36aQPGhzuOMwDerC1Eb2VmrLw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.42.0.tgz", + "integrity": "sha512-gldmAyS9hpj+H6LpRNlcjQWbuKUtb94lodB9uCz71Jm+7BxK1VIOo7y62tZZwxhA7j1ylv/yQz080L5WkS+LoQ==", "cpu": [ "arm" ], @@ -1258,9 +1258,9 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.41.1.tgz", - "integrity": 
"sha512-DXdQe1BJ6TK47ukAoZLehRHhfKnKg9BjnQYUu9gzhI8Mwa1d2fzxA1aw2JixHVl403bwp1+/o/NhhHtxWJBgEA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.42.0.tgz", + "integrity": "sha512-bpRipfTgmGFdCZDFLRvIkSNO1/3RGS74aWkJJTFJBH7h3MRV4UijkaEUeOMbi9wxtxYmtAbVcnMtHTPBhLEkaw==", "cpu": [ "arm64" ], @@ -1272,9 +1272,9 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.41.1.tgz", - "integrity": "sha512-5afxvwszzdulsU2w8JKWwY8/sJOLPzf0e1bFuvcW5h9zsEg+RQAojdW0ux2zyYAz7R8HvvzKCjLNJhVq965U7w==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.42.0.tgz", + "integrity": "sha512-JxHtA081izPBVCHLKnl6GEA0w3920mlJPLh89NojpU2GsBSB6ypu4erFg/Wx1qbpUbepn0jY4dVWMGZM8gplgA==", "cpu": [ "arm64" ], @@ -1286,9 +1286,9 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.41.1.tgz", - "integrity": "sha512-egpJACny8QOdHNNMZKf8xY0Is6gIMz+tuqXlusxquWu3F833DcMwmGM7WlvCO9sB3OsPjdC4U0wHw5FabzCGZg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.42.0.tgz", + "integrity": "sha512-rv5UZaWVIJTDMyQ3dCEK+m0SAn6G7H3PRc2AZmExvbDvtaDc+qXkei0knQWcI3+c9tEs7iL/4I4pTQoPbNL2SA==", "cpu": [ "x64" ], @@ -1300,9 +1300,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.41.1.tgz", - "integrity": "sha512-DBVMZH5vbjgRk3r0OzgjS38z+atlupJ7xfKIDJdZZL6sM6wjfDNo64aowcLPKIx7LMQi8vybB56uh1Ftck/Atg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.42.0.tgz", + "integrity": 
"sha512-fJcN4uSGPWdpVmvLuMtALUFwCHgb2XiQjuECkHT3lWLZhSQ3MBQ9pq+WoWeJq2PrNxr9rPM1Qx+IjyGj8/c6zQ==", "cpu": [ "arm64" ], @@ -1314,9 +1314,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.41.1.tgz", - "integrity": "sha512-3FkydeohozEskBxNWEIbPfOE0aqQgB6ttTkJ159uWOFn42VLyfAiyD9UK5mhu+ItWzft60DycIN1Xdgiy8o/SA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.42.0.tgz", + "integrity": "sha512-CziHfyzpp8hJpCVE/ZdTizw58gr+m7Y2Xq5VOuCSrZR++th2xWAz4Nqk52MoIIrV3JHtVBhbBsJcAxs6NammOQ==", "cpu": [ "x64" ], @@ -1328,9 +1328,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.41.1.tgz", - "integrity": "sha512-wC53ZNDgt0pqx5xCAgNunkTzFE8GTgdZ9EwYGVcg+jEjJdZGtq9xPjDnFgfFozQI/Xm1mh+D9YlYtl+ueswNEg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.42.0.tgz", + "integrity": "sha512-UsQD5fyLWm2Fe5CDM7VPYAo+UC7+2Px4Y+N3AcPh/LdZu23YcuGPegQly++XEVaC8XUTFVPscl5y5Cl1twEI4A==", "cpu": [ "arm" ], @@ -1342,9 +1342,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.41.1.tgz", - "integrity": "sha512-jwKCca1gbZkZLhLRtsrka5N8sFAaxrGz/7wRJ8Wwvq3jug7toO21vWlViihG85ei7uJTpzbXZRcORotE+xyrLA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.42.0.tgz", + "integrity": "sha512-/i8NIrlgc/+4n1lnoWl1zgH7Uo0XK5xK3EDqVTf38KvyYgCU/Rm04+o1VvvzJZnVS5/cWSd07owkzcVasgfIkQ==", "cpu": [ "arm" ], @@ -1356,9 +1356,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - 
"version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.41.1.tgz", - "integrity": "sha512-g0UBcNknsmmNQ8V2d/zD2P7WWfJKU0F1nu0k5pW4rvdb+BIqMm8ToluW/eeRmxCared5dD76lS04uL4UaNgpNA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.42.0.tgz", + "integrity": "sha512-eoujJFOvoIBjZEi9hJnXAbWg+Vo1Ov8n/0IKZZcPZ7JhBzxh2A+2NFyeMZIRkY9iwBvSjloKgcvnjTbGKHE44Q==", "cpu": [ "arm64" ], @@ -1370,9 +1370,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.41.1.tgz", - "integrity": "sha512-XZpeGB5TKEZWzIrj7sXr+BEaSgo/ma/kCgrZgL0oo5qdB1JlTzIYQKel/RmhT6vMAvOdM2teYlAaOGJpJ9lahg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.42.0.tgz", + "integrity": "sha512-/3NrcOWFSR7RQUQIuZQChLND36aTU9IYE4j+TB40VU78S+RA0IiqHR30oSh6P1S9f9/wVOenHQnacs/Byb824g==", "cpu": [ "arm64" ], @@ -1384,9 +1384,9 @@ ] }, "node_modules/@rollup/rollup-linux-loongarch64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.41.1.tgz", - "integrity": "sha512-bkCfDJ4qzWfFRCNt5RVV4DOw6KEgFTUZi2r2RuYhGWC8WhCA8lCAJhDeAmrM/fdiAH54m0mA0Vk2FGRPyzI+tw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.42.0.tgz", + "integrity": "sha512-O8AplvIeavK5ABmZlKBq9/STdZlnQo7Sle0LLhVA7QT+CiGpNVe197/t8Aph9bhJqbDVGCHpY2i7QyfEDDStDg==", "cpu": [ "loong64" ], @@ -1398,9 +1398,9 @@ ] }, "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.41.1.tgz", - "integrity": 
"sha512-3mr3Xm+gvMX+/8EKogIZSIEF0WUu0HL9di+YWlJpO8CQBnoLAEL/roTCxuLncEdgcfJcvA4UMOf+2dnjl4Ut1A==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.42.0.tgz", + "integrity": "sha512-6Qb66tbKVN7VyQrekhEzbHRxXXFFD8QKiFAwX5v9Xt6FiJ3BnCVBuyBxa2fkFGqxOCSGGYNejxd8ht+q5SnmtA==", "cpu": [ "ppc64" ], @@ -1412,9 +1412,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.41.1.tgz", - "integrity": "sha512-3rwCIh6MQ1LGrvKJitQjZFuQnT2wxfU+ivhNBzmxXTXPllewOF7JR1s2vMX/tWtUYFgphygxjqMl76q4aMotGw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.42.0.tgz", + "integrity": "sha512-KQETDSEBamQFvg/d8jajtRwLNBlGc3aKpaGiP/LvEbnmVUKlFta1vqJqTrvPtsYsfbE/DLg5CC9zyXRX3fnBiA==", "cpu": [ "riscv64" ], @@ -1426,9 +1426,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.41.1.tgz", - "integrity": "sha512-LdIUOb3gvfmpkgFZuccNa2uYiqtgZAz3PTzjuM5bH3nvuy9ty6RGc/Q0+HDFrHrizJGVpjnTZ1yS5TNNjFlklw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.42.0.tgz", + "integrity": "sha512-qMvnyjcU37sCo/tuC+JqeDKSuukGAd+pVlRl/oyDbkvPJ3awk6G6ua7tyum02O3lI+fio+eM5wsVd66X0jQtxw==", "cpu": [ "riscv64" ], @@ -1440,9 +1440,9 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.41.1.tgz", - "integrity": "sha512-oIE6M8WC9ma6xYqjvPhzZYk6NbobIURvP/lEbh7FWplcMO6gn7MM2yHKA1eC/GvYwzNKK/1LYgqzdkZ8YFxR8g==", + "version": "4.42.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.42.0.tgz", + "integrity": "sha512-I2Y1ZUgTgU2RLddUHXTIgyrdOwljjkmcZ/VilvaEumtS3Fkuhbw4p4hgHc39Ypwvo2o7sBFNl2MquNvGCa55Iw==", "cpu": [ "s390x" ], @@ -1454,9 +1454,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.41.1.tgz", - "integrity": "sha512-cWBOvayNvA+SyeQMp79BHPK8ws6sHSsYnK5zDcsC3Hsxr1dgTABKjMnMslPq1DvZIp6uO7kIWhiGwaTdR4Og9A==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.42.0.tgz", + "integrity": "sha512-Gfm6cV6mj3hCUY8TqWa63DB8Mx3NADoFwiJrMpoZ1uESbK8FQV3LXkhfry+8bOniq9pqY1OdsjFWNsSbfjPugw==", "cpu": [ "x64" ], @@ -1468,9 +1468,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.41.1.tgz", - "integrity": "sha512-y5CbN44M+pUCdGDlZFzGGBSKCA4A/J2ZH4edTYSSxFg7ce1Xt3GtydbVKWLlzL+INfFIZAEg1ZV6hh9+QQf9YQ==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.42.0.tgz", + "integrity": "sha512-g86PF8YZ9GRqkdi0VoGlcDUb4rYtQKyTD1IVtxxN4Hpe7YqLBShA7oHMKU6oKTCi3uxwW4VkIGnOaH/El8de3w==", "cpu": [ "x64" ], @@ -1482,9 +1482,9 @@ ] }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.41.1.tgz", - "integrity": "sha512-lZkCxIrjlJlMt1dLO/FbpZbzt6J/A8p4DnqzSa4PWqPEUUUnzXLeki/iyPLfV0BmHItlYgHUqJe+3KiyydmiNQ==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.42.0.tgz", + "integrity": "sha512-+axkdyDGSp6hjyzQ5m1pgcvQScfHnMCcsXkx8pTgy/6qBmWVhtRVlgxjWwDp67wEXXUr0x+vD6tp5W4x6V7u1A==", "cpu": [ "arm64" ], @@ 
-1496,9 +1496,9 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.41.1.tgz", - "integrity": "sha512-+psFT9+pIh2iuGsxFYYa/LhS5MFKmuivRsx9iPJWNSGbh2XVEjk90fmpUEjCnILPEPJnikAU6SFDiEUyOv90Pg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.42.0.tgz", + "integrity": "sha512-F+5J9pelstXKwRSDq92J0TEBXn2nfUrQGg+HK1+Tk7VOL09e0gBqUHugZv7SW4MGrYj41oNCUe3IKCDGVlis2g==", "cpu": [ "ia32" ], @@ -1510,9 +1510,9 @@ ] }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.41.1.tgz", - "integrity": "sha512-Wq2zpapRYLfi4aKxf2Xff0tN+7slj2d4R87WEzqw7ZLsVvO5zwYCIuEGSZYiK41+GlwUo1HiR+GdkLEJnCKTCw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.42.0.tgz", + "integrity": "sha512-LpHiJRwkaVz/LqjHjK8LCi8osq7elmpwujwbXKNW88bM8eeGxavJIKKjkjpMHAh/2xfnrt1ZSnhTv41WYUHYmA==", "cpu": [ "x64" ], @@ -1578,9 +1578,9 @@ } }, "node_modules/@types/estree": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", - "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", "license": "MIT" }, "node_modules/@types/estree-jsx": { @@ -1623,24 +1623,24 @@ "license": "MIT" }, "node_modules/@types/prop-types": { - "version": "15.7.14", - "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz", - "integrity": 
"sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==", + "version": "15.7.15", + "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz", + "integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==", "license": "MIT" }, "node_modules/@types/react": { - "version": "19.1.6", - "resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.6.tgz", - "integrity": "sha512-JeG0rEWak0N6Itr6QUx+X60uQmN+5t3j9r/OVDtWzFXKaj6kD1BwJzOksD0FF6iWxZlbE1kB0q9vtnU2ekqa1Q==", + "version": "19.1.7", + "resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.7.tgz", + "integrity": "sha512-BnsPLV43ddr05N71gaGzyZ5hzkCmGwhMvYc8zmvI8Ci1bRkkDSzDDVfAXfN2tk748OwI7ediiPX6PfT9p0QGVg==", "license": "MIT", "dependencies": { "csstype": "^3.0.2" } }, "node_modules/@types/react-dom": { - "version": "19.1.5", - "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz", - "integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==", + "version": "19.1.6", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.6.tgz", + "integrity": "sha512-4hOiT/dwO8Ko0gV1m/TJZYk3y0KBnY9vzDh7W+DH17b2HFSOGgdj33dhihPeuy3l0q23+4e+hoXHV6hCC4dCXw==", "dev": true, "license": "MIT", "peerDependencies": { @@ -1669,16 +1669,16 @@ "license": "ISC" }, "node_modules/@vitejs/plugin-react": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.0.tgz", - "integrity": "sha512-JuLWaEqypaJmOJPLWwO335Ig6jSgC1FTONCWAxnqcQthLTK/Yc9aH6hr9z/87xciejbQcnP3GnA1FWUSWeXaeg==", + "version": "4.5.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.2.tgz", + "integrity": "sha512-QNVT3/Lxx99nMQWJWF7K4N6apUEuT0KlZA3mx/mVaoGj3smm/8rc8ezz15J1pcbcjDK0V15rpHetVfya08r76Q==", "dev": true, "license": "MIT", "dependencies": { - "@babel/core": 
"^7.26.10", - "@babel/plugin-transform-react-jsx-self": "^7.25.9", - "@babel/plugin-transform-react-jsx-source": "^7.25.9", - "@rolldown/pluginutils": "1.0.0-beta.9", + "@babel/core": "^7.27.4", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.11", "@types/babel__core": "^7.20.5", "react-refresh": "^0.17.0" }, @@ -1686,7 +1686,7 @@ "node": "^14.18.0 || >=16.0.0" }, "peerDependencies": { - "vite": "^4.2.0 || ^5.0.0 || ^6.0.0" + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0" } }, "node_modules/babel-plugin-macros": { @@ -1770,9 +1770,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001720", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001720.tgz", - "integrity": "sha512-Ec/2yV2nNPwb4DnTANEV99ZWwm3ZWfdlfkQbWSDDt+PsXEVYwlhPH8tdMaPunYTKKmz7AnHi2oNEi1GcmKCD8g==", + "version": "1.0.30001721", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001721.tgz", + "integrity": "sha512-cOuvmUVtKrtEaoKiO0rSc29jcjwMwX5tOHDy4MgVFEWiUXj4uBMJkwI8MDySkgXidpMiHUcviogAvFi4pA2hDQ==", "dev": true, "funding": [ { @@ -1959,9 +1959,9 @@ } }, "node_modules/electron-to-chromium": { - "version": "1.5.161", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.161.tgz", - "integrity": "sha512-hwtetwfKNZo/UlwHIVBlKZVdy7o8bIZxxKs0Mv/ROPiQQQmDgdm5a+KvKtBsxM8ZjFzTaCeLoodZ8jiBE3o9rA==", + "version": "1.5.166", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.166.tgz", + "integrity": "sha512-QPWqHL0BglzPYyJJ1zSSmwFFL6MFXhbACOCcsCdUMCkzPdS9/OIBVxg516X/Ado2qwAq8k0nJJ7phQPCqiaFAw==", "dev": true, "license": "ISC" }, @@ -2054,9 +2054,9 @@ "license": "MIT" }, "node_modules/fdir": { - "version": "6.4.5", - "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.5.tgz", - "integrity": 
"sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==", + "version": "6.4.6", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz", + "integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==", "dev": true, "license": "MIT", "peerDependencies": { @@ -3342,9 +3342,9 @@ } }, "node_modules/rollup": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.41.1.tgz", - "integrity": "sha512-cPmwD3FnFv8rKMBc1MxWCwVQFxwf1JEmSX3iQXrRVVG15zerAIXRjMFVWnd5Q5QvgKF7Aj+5ykXFhUl+QGnyOw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.42.0.tgz", + "integrity": "sha512-LW+Vse3BJPyGJGAJt1j8pWDKPd73QM8cRXYK1IxOBgL2AGLu7Xd2YOW0M2sLUBCkF5MshXXtMApyEAEzMVMsnw==", "dev": true, "license": "MIT", "dependencies": { @@ -3358,29 +3358,36 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.41.1", - "@rollup/rollup-android-arm64": "4.41.1", - "@rollup/rollup-darwin-arm64": "4.41.1", - "@rollup/rollup-darwin-x64": "4.41.1", - "@rollup/rollup-freebsd-arm64": "4.41.1", - "@rollup/rollup-freebsd-x64": "4.41.1", - "@rollup/rollup-linux-arm-gnueabihf": "4.41.1", - "@rollup/rollup-linux-arm-musleabihf": "4.41.1", - "@rollup/rollup-linux-arm64-gnu": "4.41.1", - "@rollup/rollup-linux-arm64-musl": "4.41.1", - "@rollup/rollup-linux-loongarch64-gnu": "4.41.1", - "@rollup/rollup-linux-powerpc64le-gnu": "4.41.1", - "@rollup/rollup-linux-riscv64-gnu": "4.41.1", - "@rollup/rollup-linux-riscv64-musl": "4.41.1", - "@rollup/rollup-linux-s390x-gnu": "4.41.1", - "@rollup/rollup-linux-x64-gnu": "4.41.1", - "@rollup/rollup-linux-x64-musl": "4.41.1", - "@rollup/rollup-win32-arm64-msvc": "4.41.1", - "@rollup/rollup-win32-ia32-msvc": "4.41.1", - "@rollup/rollup-win32-x64-msvc": "4.41.1", + "@rollup/rollup-android-arm-eabi": "4.42.0", + "@rollup/rollup-android-arm64": "4.42.0", + "@rollup/rollup-darwin-arm64": 
"4.42.0", + "@rollup/rollup-darwin-x64": "4.42.0", + "@rollup/rollup-freebsd-arm64": "4.42.0", + "@rollup/rollup-freebsd-x64": "4.42.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.42.0", + "@rollup/rollup-linux-arm-musleabihf": "4.42.0", + "@rollup/rollup-linux-arm64-gnu": "4.42.0", + "@rollup/rollup-linux-arm64-musl": "4.42.0", + "@rollup/rollup-linux-loongarch64-gnu": "4.42.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.42.0", + "@rollup/rollup-linux-riscv64-gnu": "4.42.0", + "@rollup/rollup-linux-riscv64-musl": "4.42.0", + "@rollup/rollup-linux-s390x-gnu": "4.42.0", + "@rollup/rollup-linux-x64-gnu": "4.42.0", + "@rollup/rollup-linux-x64-musl": "4.42.0", + "@rollup/rollup-win32-arm64-msvc": "4.42.0", + "@rollup/rollup-win32-ia32-msvc": "4.42.0", + "@rollup/rollup-win32-x64-msvc": "4.42.0", "fsevents": "~2.3.2" } }, + "node_modules/rollup/node_modules/@types/estree": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", + "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "dev": true, + "license": "MIT" + }, "node_modules/scheduler": { "version": "0.26.0", "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz", diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index cf42f1e..ca3359d 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -8,6 +8,7 @@ Factory method to initialize an extractor instance based on its name. 
from __future__ import annotations from abc import abstractmethod +from contextlib import suppress import mimetypes import os import requests @@ -16,6 +17,7 @@ from retrying import retry import re from auto_archiver.core import Metadata, BaseModule +from auto_archiver.utils.url import get_media_url_best_quality class Extractor(BaseModule): @@ -70,10 +72,20 @@ class Extractor(BaseModule): return "" @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5) - def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str: + def download_from_url(self, url: str, to_filename: str = None, verbose=True, try_best_quality=False) -> str: """ downloads a URL to provided filename, or inferred from URL, returns local filename + Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful. """ + + if try_best_quality: + with suppress(Exception): + # Attempt to download the original URL + best_quality_url = get_media_url_best_quality(url) + orig_download = self.download_from_url(best_quality_url, to_filename, verbose) + if orig_download: + return orig_download, best_quality_url + if not to_filename: to_filename = url.split("/")[-1].split("?")[0] if len(to_filename) > 64: @@ -98,10 +110,12 @@ class Extractor(BaseModule): with open(to_filename, "wb") as f: for chunk in d.iter_content(chunk_size=8192): f.write(chunk) + if try_best_quality: + return to_filename, url return to_filename except requests.RequestException as e: - logger.warning(f"Failed to fetch the Media URL: {e}") + logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") @abstractmethod def download(self, item: Metadata) -> Metadata | False: diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index d2e9d66..e2bcad9 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ 
b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -17,13 +17,14 @@ "default": 50, "help": "maximum number of videos to download from the page (0 = no download, inf = no limit).", }, - "exclude_media_extensions": { - "default": ".svg,.ico,.gif", - "help": "CSV of media (image/video) file extensions to exclude from download", - }, "user_data_dir": { "default": "secrets/antibot_user_data", - "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.", + "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.", + }, + "detect_auth_wall": { + "default": True, + "type": "bool", + "help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.", }, "proxy": { "default": None, @@ -31,7 +32,9 @@ }, }, "description": """ - Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile. + Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha. + + Still in trial development, please report any issues or suggestions via GitHub Issues. ### Features - Extracts the HTML source code of the page. @@ -40,7 +43,6 @@ - Downloads images and videos from the page, excluding specified file extensions. 
### Notes - - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH. - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary. """, } diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index e82a2f8..1982389 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -1,12 +1,10 @@ import base64 import math -import mimetypes import os import sys import traceback from urllib.parse import urljoin import glob -import stat import importlib.util from loguru import logger @@ -15,7 +13,9 @@ from seleniumbase import SB from auto_archiver.core import Extractor, Enricher, Metadata, Media from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin +from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin from auto_archiver.utils.misc import random_str +from auto_archiver.utils.url import is_relevant_url class AntibotExtractorEnricher(Extractor, Enricher): @@ -25,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): self.agent = None # Use the default UserAgent # parse configuration options - self.exclude_media_mimetypes = set( - [mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")] - ) - {None} - if self.max_download_images == "inf": self.max_download_images = math.inf else: @@ -39,7 +35,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): else: self.max_download_videos = int(self.max_download_videos) - self._prepare_and_warn_about_docker_and_user_data_dir() + self._prepare_user_data_dir() self.dropins = self.load_dropins() @@ -77,19 +73,12 @@ class AntibotExtractorEnricher(Extractor, Enricher): result.status = "antibot" return 
result - def _prepare_and_warn_about_docker_and_user_data_dir(self): - os.makedirs(self.user_data_dir, exist_ok=True) - - in_docker = os.environ.get("RUNNING_IN_DOCKER") - if in_docker and self.user_data_dir: - st = os.stat(self.user_data_dir) - perms = stat.filemode(st.st_mode) - owner = st.st_uid - group = st.st_gid - if owner != 0 or group != 0: - logger.warning( - f"""ANTIBOT: Running in Docker with user_data_dir {self.user_data_dir} with permissions {perms} and non-root {owner=}. This may cause issues with Chrome, if you get 'session not created' errors make sure to remove the folder and let docker create it.""" - ) + def _prepare_user_data_dir(self): + if self.user_data_dir: + in_docker = os.environ.get("RUNNING_IN_DOCKER") + if in_docker: + self.user_data_dir = self.user_data_dir.rstrip(os.path.sep) + "_docker" + os.makedirs(self.user_data_dir, exist_ok=True) def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None @@ -102,39 +91,41 @@ class AntibotExtractorEnricher(Extractor, Enricher): sb.uc_open_with_reconnect(url, 4) logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") + sb.uc_gui_handle_cf() + sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future - # TODO: implement other Captcha handling - sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected + dropin = self._get_suitable_dropin(url, sb) + dropin.open_page(url) - suitable_dropin = self._get_suitable_dropin(url, sb) - - if suitable_dropin: - suitable_dropin.open_page(url) - - if self._hit_auth_wall(sb): + if self.detect_auth_wall and self._hit_auth_wall(sb): logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") return False - logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...") + sb.wait_for_ready_state_complete() sb.sleep(1) # 
margin for the page to load completely to_enrich.set_title(sb.get_title()) self._enrich_html_source_code(sb, to_enrich) + self._enrich_full_page_screenshot(sb, to_enrich) if self.save_to_pdf: self._enrich_full_page_pdf(sb, to_enrich) - downloaded_images, downloaded_videos = 0, 0 - if suitable_dropin: - downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich) + downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich) self._enrich_download_media( - sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images + sb, + to_enrich, + css_selector=dropin.images_selectors(), + max_media=self.max_download_images - downloaded_images, ) self._enrich_download_media( - sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos + sb, + to_enrich, + css_selector=dropin.video_selectors(), + max_media=self.max_download_videos - downloaded_videos, ) - logger.success(f"ANTIBOT completed for {url_sample}") + logger.info(f"ANTIBOT completed for {url_sample}") return to_enrich except selenium.common.exceptions.SessionNotCreatedException as e: @@ -155,10 +146,10 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ for dropin in self.dropins: if dropin.suitable(url): - logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}") + logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}") return dropin(sb, self) - # logger.warning(f"ANTIBOT no suitable drop-in found for {url}") - return None + + return DefaultDropin(sb, self) def _hit_auth_wall(self, sb: SB) -> bool: """ @@ -168,8 +159,8 @@ class AntibotExtractorEnricher(Extractor, Enricher): # TODO: improve this detection logic, currently it is very basic and may not cover all cases # Common URL patterns - url = sb.get_current_url().lower() - if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]): + current_url = sb.get_current_url().lower() + if any(kw in current_url for 
kw in ["login", "signin", "signup", "register", "captcha"]): return True # Common visible text markers @@ -245,8 +236,12 @@ class AntibotExtractorEnricher(Extractor, Enricher): Enriches the full page screenshot of the Metadata object. This method is called by the enrich method. """ - x = sb.execute_script("return document.documentElement.scrollWidth") - y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000) + start_size = sb.get_window_size() + w, h = start_size["width"], start_size["height"] + + x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) + y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) + logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") sb.set_window_size(x, y) screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") @@ -278,12 +273,9 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ if max_media == 0: return - logger.debug( - f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items." 
- ) url = to_enrich.get_url() all_urls = set() - # media_elements = sb.find_elements(css_selector) + sources = sb.execute_script(f""" return Array.from(document.querySelectorAll("{css_selector}")) .map(el => el.src || el.href) @@ -293,10 +285,12 @@ class AntibotExtractorEnricher(Extractor, Enricher): if len(all_urls) >= max_media: logger.debug(f"Reached max download limit of {max_media} images/videos.") break - mimerype = mimetypes.guess_type(src)[0] - if mimerype in self.exclude_media_mimetypes: + if not is_relevant_url(src): continue full_src = urljoin(url, src) - if full_src not in all_urls and (filename := self.download_from_url(full_src)): + if full_src not in all_urls: + filename, full_src = self.download_from_url(full_src, try_best_quality=True) + if not filename: + continue all_urls.add(full_src) to_enrich.add_media(Media(filename=filename, properties={"url": full_src})) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 805edfd..15c2e28 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -1,7 +1,10 @@ +import os +from loguru import logger from seleniumbase import SB +import yt_dlp -from auto_archiver.core.extractor import Extractor -from auto_archiver.core.metadata import Metadata +from auto_archiver.core import Extractor, Media, Metadata +from auto_archiver.utils.misc import ydl_entry_to_filename class Dropin: @@ -36,6 +39,20 @@ class Dropin: """ return url + @staticmethod + def images_selectors() -> str: + """ + CSS selector to find images in the HTML page + """ + return "img" + + @staticmethod + def video_selectors() -> str: + """ + CSS selector to find videos in the HTML page. + """ + return "video, source" + def open_page(self, url) -> bool: """ Make sure the page is opened, even if it requires authentication, captcha solving, etc. 
@@ -50,3 +67,59 @@ class Dropin: :return: A tuple (number of Images added, number of Videos added). """ raise NotImplementedError("This method should be implemented in the subclass") + + def _get_username_password(self, site) -> tuple[str, str]: + """ + Get the username and password for the site from the extractor's auth data. + :return: A tuple (username, password). + """ + auth = self.extractor.auth_for_site(site) + username = auth.get("username", "") + password = auth.get("password", "") + if not username or not password: + raise ValueError(f"{site} authentication requires a username and password.") + return username, password + + def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int: + """ + Download videos using yt-dlp. + :param video_urls: List of video URLs to download. + :return: The number of videos downloaded. + """ + if type(self.extractor.max_download_videos) is int: + video_urls = video_urls[: self.extractor.max_download_videos] + + if not video_urls: + return 0 + + ydl_options = [ + "-o", + os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), + "--quiet", + "--no-playlist", + "--no-write-subs", + "--no-write-auto-subs", + "--postprocessor-args", + "ffmpeg:-bitexact", + "--max-filesize", + "1000M", # Limit to 1GB per video + ] + *_, validated_options = yt_dlp.parse_options(ydl_options) + downloaded = 0 + with yt_dlp.YoutubeDL(validated_options) as ydl: + for url in video_urls: + try: + logger.debug(f"Downloading video from URL: {url}") + info = ydl.extract_info(url, download=True) + filename = ydl_entry_to_filename(ydl, info) + if not filename: # Failed to download video. 
+ continue + media = Media(filename) + for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: + if x in info: + media.set(x, info[x]) + to_enrich.add_media(media) + downloaded += 1 + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + return downloaded diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py new file mode 100644 index 0000000..c5c865a --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py @@ -0,0 +1,18 @@ +from auto_archiver.core.metadata import Metadata +from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + + +class DefaultDropin(Dropin): + """ + A default fallback drop-in class for handling generic cases in the antibot extractor enricher module. + """ + + @staticmethod + def suitable(url: str) -> bool: + return False + + def open_page(self, url) -> bool: + return True + + def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: + return 0, 0 diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py new file mode 100644 index 0000000..44d572b --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -0,0 +1,78 @@ +from contextlib import suppress +from auto_archiver.core.metadata import Metadata +from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + +from loguru import logger + + +class RedditDropin(Dropin): + """ + A class to handle Reddit drop-in functionality for the antibot extractor enricher module. 
+ """ + + @staticmethod + def suitable(url: str) -> bool: + return "reddit.com" in url + + @staticmethod + def images_selectors() -> str: + return "shreddit-post img" + + @staticmethod + def video_selectors() -> str: + return "shreddit-post video, shreddit-post source" + + def open_page(self, url) -> bool: + if self.sb.is_text_visible("You've been blocked by network security."): + self._login() + if url != self.sb.get_current_url(): + self.sb.open(url) + return True + + @logger.catch + def _login(self): + self.sb.click_link_text("Log in") + self.sb.wait_for_ready_state_complete() + self._close_cookies_banner() + + username, password = self._get_username_password("reddit.com") + logger.debug("RedditDropin Logging in to VK with username: {}", username) + + self.sb.type("#login-username", username) + self.sb.type("#login-password", password) + + elem = self.sb.find_element("button.login") + self.sb.execute_script("arguments[0].scrollIntoView(true);", elem) + self.sb.slow_click("button.login") + self.sb.wait_for_ready_state_complete() + + if "https://www.reddit.com/login/" in self.sb.get_current_url(): + self.sb.sleep(5) + self.sb.wait_for_ready_state_complete() + + if self.sb.is_text_visible("You've been blocked by network security."): + self.sb.click_link_text("Log in") + self.sb.wait_for_ready_state_complete() + if self.sb.is_text_visible("Welcome back"): + logger.debug("RedditDropin Login successful") + self.sb.click_if_visible("this link") + + def _close_cookies_banner(self): + with suppress(Exception): # selenium.common.exceptions.JavascriptException + self.sb.execute_script(""" + document + .querySelector("reddit-cookie-banner") + .shadowRoot.querySelector("faceplate-dialog") + .querySelector("#accept-all-cookies-button button") + .click() + """) + + @logger.catch + def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: + filtered_urls = self.sb.execute_script(rf""" + return [...document.querySelectorAll("{self.video_selectors()}")] + .map(el => 
el.src || el.href) + .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); + """) + logger.debug("RedditDropin Found {} video URLs", len(filtered_urls)) + return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 6f54187..6888727 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -1,12 +1,8 @@ -import os import re -from auto_archiver.core.media import Media from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from auto_archiver.utils.misc import ydl_entry_to_filename -import yt_dlp from loguru import logger @@ -37,10 +33,11 @@ class VkDropin(Dropin): def open_page(self, url) -> bool: if self.sb.is_text_visible("Sign in to VK"): - self._login() - self.sb.open(url) + if self._login(): + self.sb.open(url) return True + @logger.catch def _login(self) -> bool: # TODO: test method self.sb.open("https://vk.com") @@ -50,13 +47,9 @@ class VkDropin(Dropin): return True # need to login - logger.debug("Logging in to VK...") - auth = self.extractor.auth_for_site("vk.com") - username = auth.get("username", "") - password = auth.get("password", "") - if not username or not password: - raise ValueError("VK authentication requires a username and password.") - logger.debug("Using username: {}", username) + username, password = self._get_username_password("vk.com") + logger.debug("Logging in to VK with username: {}", username) + self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10) @@ -80,47 +73,6 @@ class VkDropin(Dropin): @logger.catch def add_extra_media(self, 
to_enrich: Metadata) -> tuple[int, int]: - """ - Extract video data from the currently open post with SeleniumBase. - - :return: A tuple (number of Images added, number of Videos added). - """ video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')] - if type(self.extractor.max_download_videos) is int: - video_urls = video_urls[: self.extractor.max_download_videos] - if not video_urls: - return 0, 0 - - logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.") - ydl_options = [ - "-o", - os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), - "--quiet", - "--no-playlist", - "--no-write-subs", - "--no-write-auto-subs", - "--postprocessor-args", - "ffmpeg:-bitexact", - "--max-filesize", - "1000M", # Limit to 1GB per video - ] - *_, validated_options = yt_dlp.parse_options(ydl_options) - downloaded = 0 - with yt_dlp.YoutubeDL(validated_options) as ydl: - for url in video_urls: - try: - logger.debug(f"Downloading video from URL: {url}") - info = ydl.extract_info(url, download=True) - filename = ydl_entry_to_filename(ydl, info) - if not filename: # Failed to download video. 
- continue - media = Media(filename) - for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: - if x in info: - media.set(x, info[x]) - to_enrich.add_media(media) - downloaded += 1 - except Exception as e: - logger.error(f"Error downloading {url}: {e}") - return 0, downloaded + return 0, self._download_videos_with_ytdlp(video_urls, to_enrich) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 72db630..52cf8b8 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -30,6 +30,8 @@ For a full list of video platforms supported by `yt-dlp`, see the custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). +You can see all currently implemented dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/generic_extractor). + ### Auto-Updates The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default). diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 368d93c..2bb19cf 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str: def is_relevant_url(url: str) -> bool: """ Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc. + + Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so it excludes extensions like .m3u8.
""" clean_url = remove_get_parameters(url) @@ -104,11 +106,19 @@ def is_relevant_url(url: str) -> bool: ("vk.com/images/reaction/",), # wikipedia ("wikipedia.org/static",), + # reddit + ("styles.redditmedia.com",), # opinionated but excludes many irrelevant images like avatars and banners + ("emoji.redditmedia.com",), ] + # TODO: make these globally configurable IRRELEVANT_ENDS_WITH = [ ".svg", # ignore SVGs ".ico", # ignore icons + # ignore index files for videos, these should be handled by ytdlp + ".m3u8", + ".mpd", + ".ism", ] for end in IRRELEVANT_ENDS_WITH: @@ -125,6 +135,36 @@ def is_relevant_url(url: str) -> bool: def twitter_best_quality_url(url: str) -> str: """ some twitter image URLs point to a less-than best quality - this returns the URL pointing to the highest (original) quality + this returns the URL pointing to the highest (original) quality (with 'name=orig') """ - return re.sub(r"name=(\w+)", "name=orig", url, 1) + parsed = urlparse(url) + query = parsed.query + if "name=" in query: + # Replace only the first `name` parameter; the (^|&) anchor keeps keys like `filename=` intact + new_query = re.sub(r"(^|&)name=[^&]*", r"\1name=orig", query, count=1) + parsed = parsed._replace(query=new_query) + return urlunparse(parsed) + return url + + +def get_media_url_best_quality(url: str) -> str: + """ + Returns the best-quality candidate URL for the given media URL; the returned URL is not guaranteed to exist.
+ """ + parsed = urlparse(url) + + # twitter case: match the host exactly or as a subdomain ("x.com" in netloc would also hit "netflix.com") + if any(("." + (parsed.hostname or "")).endswith("." + d) for d in ("twitter.com", "twimg.com", "x.com")): + url = twitter_best_quality_url(url) + parsed = urlparse(url) + + # strip a -WxH size suffix, e.g. https://example.com/media-1280x720.mp4 -> https://example.com/media.mp4 + basename = parsed.path.split("/")[-1] + match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", basename) + if match: + orig_basename = match.group(1) + match.group(2) + new_path = "/".join(parsed.path.split("/")[:-1] + [orig_basename]) + parsed = parsed._replace(path=new_path) # keep the query unchanged + url = urlunparse(parsed) + + return url diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 1da025d..06107b4 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "save_to_pdf": False, "max_download_images": 0, "max_download_videos": 0, - "exclude_media_extensions": ".svg,.ico,.gif", "proxy": None, } @@ -129,15 +128,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase): ), ( "https://seleniumbase.io/apps/turnstile", - 'id="captcha-success"', + '', + ), + ( + "https://seleniumbase.io/apps/form_turnstile", + '', + ), + ( + "https://gitlab.com/users/sign_in", + "Password", ), ], ) - def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html): + def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html): """ Test downloading a page with Cloudflare Turnstile captcha.
""" + self.extractor = setup_module( + self.extractor_module, + { + "save_to_pdf": True, + "detect_auth_wall": False, + "max_download_images": 5, + "max_download_videos": "inf", + }, + ) + item = make_item(url) self.extractor.enrich(item) diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index 7871847..df8e0f3 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -6,6 +6,7 @@ from auto_archiver.utils.url import ( is_relevant_url, remove_get_parameters, twitter_best_quality_url, + get_media_url_best_quality, ) @@ -95,6 +96,11 @@ def test_remove_get_parameters(url, without_get): ("https://example.com/150x150.jpg", True), ("https://example.com/rsrc.php/", True), ("https://example.com/img/emoji/", True), + ("https://styles.redditmedia.com/123", False), + ("https://emoji.redditmedia.com/abc.jpg", False), + ("https://example.com/rsrc.m3u8?asdasd=10", False), + ("https://example.com/rsrc.mpd", False), + ("https://example.com/rsrc.ism?vid=12", False), ], ) def test_is_relevant_url(url, relevant): @@ -104,10 +110,55 @@ def test_is_relevant_url(url, relevant): @pytest.mark.parametrize( "url, best_quality", [ - ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"), + ( + "https://twitter.com/some_image.jpg?name=small&this_is_another=145", + "https://twitter.com/some_image.jpg?name=orig&this_is_another=145", + ), ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"), ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"), ], ) def test_twitter_best_quality_url(url, best_quality): assert twitter_best_quality_url(url) == best_quality + + +@pytest.mark.parametrize( + "input_url,expected_url", + [ + # Twitter: replace an existing name= value with name=orig (never added when absent) + ( + "https://pbs.twimg.com/media/abc123?format=jpg&name=small", + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + ), + ("https://pbs.twimg.com/media/abc123?name=large", 
"https://pbs.twimg.com/media/abc123?name=orig"), + ("https://pbs.twimg.com/media/abc123?format=jpg", "https://pbs.twimg.com/media/abc123?format=jpg"), + # Twitter: already orig + ( + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + ), + # X.com domain + ("https://x.com/media/abc123?name=medium", "https://x.com/media/abc123?name=orig"), + # twimg.com domain + ("https://twimg.com/media/abc123?name=thumb", "https://twimg.com/media/abc123?name=orig"), + # Non-twitter domain, no change + ("https://example.com/media/file.mp4", "https://example.com/media/file.mp4"), + # Remove -WxH from basename + ("https://example.com/media/file-1280x720.mp4", "https://example.com/media/file.mp4"), + ("https://example.com/media/file-1920x1080.jpg?foo=bar", "https://example.com/media/file.jpg?foo=bar"), + # Both twitter and -WxH + ("https://pbs.twimg.com/media/abc-1280x720.jpg?name=small", "https://pbs.twimg.com/media/abc.jpg?name=orig"), + # No match for -WxH, no change + ("https://example.com/media/file.mp4?foo=bar", "https://example.com/media/file.mp4?foo=bar"), + # Path with multiple directories + ("https://example.com/a/b/c/file-640x480.png", "https://example.com/a/b/c/file.png"), + # -WxH in directory, not basename (should not change) + ("https://example.com/media-1280x720/file.mp4", "https://example.com/media-1280x720/file.mp4"), + # Host that merely contains "x.com" is not Twitter/X (should not change) + ("https://netflix.com/media/abc123?name=small", "https://netflix.com/media/abc123?name=small"), + # Only the `name` parameter is rewritten, not keys ending in "name" + ("https://pbs.twimg.com/media/abc123?filename=a.jpg", "https://pbs.twimg.com/media/abc123?filename=a.jpg"), + ], +) +def test_get_media_url_best_quality(input_url, expected_url): + assert get_media_url_best_quality(input_url) == expected_url