mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Merge pull request #318 from bellingcat/feat/antibot-reddit
Adds RedditDropin and other flow improvements
This commit is contained in:
8
.github/dependabot.yml
vendored
8
.github/dependabot.yml
vendored
@@ -12,7 +12,7 @@ updates:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
interval: "monthly"
|
||||
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
@@ -21,7 +21,7 @@ updates:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
interval: "monthly"
|
||||
|
||||
- package-ecosystem: "npm"
|
||||
directory: "/scripts/settings/"
|
||||
@@ -30,11 +30,11 @@ updates:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
interval: "monthly"
|
||||
|
||||
- package-ecosystem: "docker"
|
||||
# Look for a `Dockerfile` in the `root` directory
|
||||
directory: "/"
|
||||
# Check for updates once a week
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
interval: "monthly"
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:1.6.1 AS base
|
||||
FROM webrecorder/browsertrix-crawler:1.6.2 AS base
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1 \
|
||||
LANG=C.UTF-8 \
|
||||
|
||||
44
docs/source/how_to/upgrading_1_0_1_to_1_1_0.md
Normal file
44
docs/source/how_to/upgrading_1_0_1_to_1_1_0.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# Upgrading from v1.0.1
|
||||
|
||||
```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.1.0).
|
||||
|
||||
If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
|
||||
```
|
||||
|
||||
Versions 1.1.0+ of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
|
||||
|
||||
|
||||
## Dropping `vk_extractor` module
|
||||
We have dropped the `vk_extractor` because of problems in a project we relied on. You will need to remove it from your configuration file, otherwise you will see an error like:
|
||||
|
||||
```{code} console
|
||||
Module 'vk_extractor' not found. Are you sure it's installed/exists?
|
||||
```
|
||||
|
||||
## New `antibot_extractor_enricher` module and VkDropin
|
||||
We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
|
||||
|
||||
```{code} yaml
|
||||
steps:
|
||||
extractors:
|
||||
- antibot_extractor_enricher
|
||||
|
||||
# or alternatively, if you want to use it as an enricher:
|
||||
enrichers:
|
||||
- antibot_extractor_enricher
|
||||
```
|
||||
|
||||
It comes with Dropins that we will be adding and maintaining.
|
||||
|
||||
> Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though.
|
||||
|
||||
One such Dropin is the VkDropin which uses this automated browser to access VKontakte (VK) pages. You should add a username/password to the configuration file if you get authentication blocks from VK, to do so use the [authentication settings](authentication_how_to.md):
|
||||
|
||||
```{code} yaml
|
||||
authentication:
|
||||
vk:
|
||||
username: your_username
|
||||
password: your_password
|
||||
```
|
||||
|
||||
See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin.
|
||||
109
poetry.lock
generated
109
poetry.lock
generated
@@ -193,18 +193,18 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "boto3"
|
||||
version = "1.38.27"
|
||||
version = "1.38.33"
|
||||
description = "The AWS SDK for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "boto3-1.38.27-py3-none-any.whl", hash = "sha256:95f5fe688795303a8a15e8b7e7f255cadab35eae459d00cc281a4fd77252ea80"},
|
||||
{file = "boto3-1.38.27.tar.gz", hash = "sha256:94bd7fdd92d5701b362d4df100d21e28f8307a67ff56b6a8b0398119cf22f859"},
|
||||
{file = "boto3-1.38.33-py3-none-any.whl", hash = "sha256:25d0717489c658f7ae6c3c7f0f7e1b4d611b30b2f08f0fcef6455ac6864a8768"},
|
||||
{file = "boto3-1.38.33.tar.gz", hash = "sha256:6467909c1ae01ff67981f021bb2568592211765ec8a9a1d2c4529191e46c3541"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.38.27,<1.39.0"
|
||||
botocore = ">=1.38.33,<1.39.0"
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
s3transfer = ">=0.13.0,<0.14.0"
|
||||
|
||||
@@ -213,14 +213,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
|
||||
|
||||
[[package]]
|
||||
name = "botocore"
|
||||
version = "1.38.27"
|
||||
version = "1.38.33"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "botocore-1.38.27-py3-none-any.whl", hash = "sha256:a785d5e9a5eda88ad6ab9ed8b87d1f2ac409d0226bba6ff801c55359e94d91a8"},
|
||||
{file = "botocore-1.38.27.tar.gz", hash = "sha256:9788f7efe974328a38cbade64cc0b1e67d27944b899f88cb786ae362973133b6"},
|
||||
{file = "botocore-1.38.33-py3-none-any.whl", hash = "sha256:ad25233e93dcebe95809b1f9393c1f11a639696327793d166295fb78dd7bc597"},
|
||||
{file = "botocore-1.38.33.tar.gz", hash = "sha256:dbe8fea9d0426c644c89ef2018ead886ccbcc22901a02b377b4e65ce1cb69a2b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -941,14 +941,14 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "google-api-core"
|
||||
version = "2.24.2"
|
||||
version = "2.25.0"
|
||||
description = "Google API client core library"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "google_api_core-2.24.2-py3-none-any.whl", hash = "sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9"},
|
||||
{file = "google_api_core-2.24.2.tar.gz", hash = "sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696"},
|
||||
{file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"},
|
||||
{file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -959,21 +959,21 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4
|
||||
requests = ">=2.18.0,<3.0.0"
|
||||
|
||||
[package.extras]
|
||||
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
|
||||
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
|
||||
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.0)"]
|
||||
grpc = ["grpcio (>=1.33.2,<2.0.0)", "grpcio (>=1.49.1,<2.0.0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.0)", "grpcio-status (>=1.49.1,<2.0.0) ; python_version >= \"3.11\""]
|
||||
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"]
|
||||
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "google-api-python-client"
|
||||
version = "2.170.0"
|
||||
version = "2.171.0"
|
||||
description = "Google API Client Library for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "google_api_python_client-2.170.0-py3-none-any.whl", hash = "sha256:7bf518a0527ad23322f070fa69f4f24053170d5c766821dc970ff0571ec22748"},
|
||||
{file = "google_api_python_client-2.170.0.tar.gz", hash = "sha256:75f3a1856f11418ea3723214e0abc59d9b217fd7ed43dcf743aab7f06ab9e2b1"},
|
||||
{file = "google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9"},
|
||||
{file = "google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -985,14 +985,14 @@ uritemplate = ">=3.0.1,<5"
|
||||
|
||||
[[package]]
|
||||
name = "google-auth"
|
||||
version = "2.40.2"
|
||||
version = "2.40.3"
|
||||
description = "Google Authentication Library"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "google_auth-2.40.2-py2.py3-none-any.whl", hash = "sha256:f7e568d42eedfded58734f6a60c58321896a621f7c116c411550a4b4a13da90b"},
|
||||
{file = "google_auth-2.40.2.tar.gz", hash = "sha256:a33cde547a2134273226fa4b853883559947ebe9207521f7afc707efbf690f58"},
|
||||
{file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"},
|
||||
{file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2130,7 +2130,7 @@ version = "2.19.1"
|
||||
description = "Pygments is a syntax highlighting package written in Python."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "docs"]
|
||||
groups = ["main", "dev", "docs"]
|
||||
files = [
|
||||
{file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
|
||||
{file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
|
||||
@@ -2335,26 +2335,27 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "8.3.5"
|
||||
version = "8.4.0"
|
||||
description = "pytest: simple powerful testing with Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
|
||||
{file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
|
||||
{file = "pytest-8.4.0-py3-none-any.whl", hash = "sha256:f40f825768ad76c0977cbacdf1fd37c6f7a468e460ea6a0636078f8972d4517e"},
|
||||
{file = "pytest-8.4.0.tar.gz", hash = "sha256:14d920b48472ea0dbf68e45b96cd1ffda4705f33307dcc86c676c1b5104838a6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "sys_platform == \"win32\""}
|
||||
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
|
||||
iniconfig = "*"
|
||||
packaging = "*"
|
||||
colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""}
|
||||
exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""}
|
||||
iniconfig = ">=1"
|
||||
packaging = ">=20"
|
||||
pluggy = ">=1.5,<2"
|
||||
pygments = ">=2.7.2"
|
||||
tomli = {version = ">=1", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-html"
|
||||
@@ -2766,19 +2767,19 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.3"
|
||||
version = "2.32.4"
|
||||
description = "Python HTTP for Humans."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "docs"]
|
||||
files = [
|
||||
{file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
|
||||
{file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
|
||||
{file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"},
|
||||
{file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = ">=2017.4.17"
|
||||
charset-normalizer = ">=2,<4"
|
||||
charset_normalizer = ">=2,<4"
|
||||
idna = ">=2.5,<4"
|
||||
PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
|
||||
urllib3 = ">=1.21.1,<3"
|
||||
@@ -2894,7 +2895,7 @@ description = "Manipulate well-formed Roman numerals"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["docs"]
|
||||
markers = "python_version >= \"3.11\""
|
||||
markers = "python_version != \"3.10\""
|
||||
files = [
|
||||
{file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"},
|
||||
{file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"},
|
||||
@@ -2921,14 +2922,14 @@ pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "ruamel-yaml"
|
||||
version = "0.18.12"
|
||||
version = "0.18.14"
|
||||
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "ruamel.yaml-0.18.12-py3-none-any.whl", hash = "sha256:790ba4c48b6a6e6b12b532a7308779eb12d2aaab3a80fdb8389216f28ea2b287"},
|
||||
{file = "ruamel.yaml-0.18.12.tar.gz", hash = "sha256:5a38fd5ce39d223bebb9e3a6779e86b9427a03fb0bf9f270060f8b149cffe5e2"},
|
||||
{file = "ruamel.yaml-0.18.14-py3-none-any.whl", hash = "sha256:710ff198bb53da66718c7db27eec4fbcc9aa6ca7204e4c1df2f282b6fe5eb6b2"},
|
||||
{file = "ruamel.yaml-0.18.14.tar.gz", hash = "sha256:7227b76aaec364df15936730efbf7d72b30c0b79b1d578bbb8e3dcb2d81f52b7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -3112,14 +3113,14 @@ websocket-client = ">=1.8.0,<1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "seleniumbase"
|
||||
version = "4.39.2"
|
||||
version = "4.39.3"
|
||||
description = "A complete web automation framework for end-to-end testing."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"},
|
||||
{file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"},
|
||||
{file = "seleniumbase-4.39.3-py3-none-any.whl", hash = "sha256:cbb94d7858a9ef3b0b4431a5879150649f4a73029afaa8ecfb7bda113f2565e1"},
|
||||
{file = "seleniumbase-4.39.3.tar.gz", hash = "sha256:b32978e685b1e4e2c7859b2dcb377ac14ba99bf748ea428548f9e450257b861b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -3156,7 +3157,7 @@ pygments = ">=2.19.1"
|
||||
pynose = ">=1.5.4"
|
||||
pyotp = "2.9.0"
|
||||
pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""}
|
||||
pytest = "8.3.5"
|
||||
pytest = {version = "8.4.0", markers = "python_version >= \"3.9\""}
|
||||
pytest-html = "4.0.2"
|
||||
pytest-metadata = "3.1.1"
|
||||
pytest-ordering = "0.6"
|
||||
@@ -3164,11 +3165,11 @@ pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""}
|
||||
pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""}
|
||||
python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""}
|
||||
pyyaml = ">=6.0.2"
|
||||
requests = "2.32.3"
|
||||
requests = "2.32.4"
|
||||
rich = ">=14.0.0,<15"
|
||||
sbvirtualdisplay = ">=1.4.0"
|
||||
selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""}
|
||||
setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""}
|
||||
setuptools = {version = ">=80.9.0", markers = "python_version >= \"3.10\""}
|
||||
six = ">=1.17.0"
|
||||
sniffio = "1.3.1"
|
||||
sortedcontainers = "2.4.0"
|
||||
@@ -3323,7 +3324,7 @@ description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.11"
|
||||
groups = ["docs"]
|
||||
markers = "python_version >= \"3.11\""
|
||||
markers = "python_version != \"3.10\""
|
||||
files = [
|
||||
{file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"},
|
||||
{file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"},
|
||||
@@ -3801,14 +3802,14 @@ test = ["coverage", "pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "uritemplate"
|
||||
version = "4.1.1"
|
||||
version = "4.2.0"
|
||||
description = "Implementation of RFC 6570 URI Templates"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"},
|
||||
{file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"},
|
||||
{file = "uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686"},
|
||||
{file = "uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4120,14 +4121,14 @@ h11 = ">=0.9.0,<1"
|
||||
|
||||
[[package]]
|
||||
name = "yt-dlp"
|
||||
version = "2025.5.22"
|
||||
version = "2025.6.9"
|
||||
description = "A feature-rich command-line audio/video downloader"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "yt_dlp-2025.5.22-py3-none-any.whl", hash = "sha256:a49c4b76afeaded6254c3e2b759d8d5a13271aa963d5fccb51fe059d1c313151"},
|
||||
{file = "yt_dlp-2025.5.22.tar.gz", hash = "sha256:ea73854c5dabc124f29a35a8fae9bc5d422ef3231bebeea2bdfa82ac191a9c29"},
|
||||
{file = "yt_dlp-2025.6.9-py3-none-any.whl", hash = "sha256:ebdfda9ffa807f6a26aed7c8f906e5557cd06b4c388dc547df1ec2078631fca8"},
|
||||
{file = "yt_dlp-2025.6.9.tar.gz", hash = "sha256:751f53a3b61353522bf805fa30bbcbd16666126537e39706eab4f8c368f111ac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -4142,7 +4143,7 @@ urllib3 = {version = ">=1.26.17,<3", optional = true, markers = "extra == \"defa
|
||||
websockets = {version = ">=13.0", optional = true, markers = "extra == \"default\""}
|
||||
|
||||
[package.extras]
|
||||
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"]
|
||||
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2,<81)", "wheel"]
|
||||
curl-cffi = ["curl-cffi (>=0.5.10,<0.6.dev0 || ==0.10.*) ; implementation_name == \"cpython\""]
|
||||
default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
|
||||
dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.11.0,<0.12.0)"]
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "1.0.1"
|
||||
version = "1.1.0"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
|
||||
293
scripts/settings/package-lock.json
generated
293
scripts/settings/package-lock.json
generated
@@ -10,21 +10,21 @@
|
||||
"dependencies": {
|
||||
"@dnd-kit/core": "^6.3.1",
|
||||
"@dnd-kit/sortable": "^10.0.0",
|
||||
"@emotion/react": "*",
|
||||
"@emotion/styled": "*",
|
||||
"@emotion/react": "latest",
|
||||
"@emotion/styled": "latest",
|
||||
"@mui/icons-material": "^7.1.1",
|
||||
"@mui/material": "*",
|
||||
"@mui/material": "latest",
|
||||
"react": "19.1.0",
|
||||
"react-dom": "19.1.0",
|
||||
"react-markdown": "^10.0.0",
|
||||
"yaml": "^2.7.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"@vitejs/plugin-react": "*",
|
||||
"typescript": "*",
|
||||
"vite": "*",
|
||||
"@types/react": "latest",
|
||||
"@types/react-dom": "latest",
|
||||
"@vitejs/plugin-react": "latest",
|
||||
"typescript": "latest",
|
||||
"vite": "latest",
|
||||
"vite-plugin-singlefile": "^2.1.0"
|
||||
}
|
||||
},
|
||||
@@ -57,9 +57,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/compat-data": {
|
||||
"version": "7.27.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.3.tgz",
|
||||
"integrity": "sha512-V42wFfx1ymFte+ecf6iXghnnP8kWTO+ZLXIyZq+1LAXHHvTZdVxicn4yiVYdYMGaCO3tmqub11AorKkv+iodqw==",
|
||||
"version": "7.27.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.5.tgz",
|
||||
"integrity": "sha512-KiRAp/VoJaWkkte84TvUd9qjdbZAdiqyvMxrGl1N6vzFogKmaLgoM3L1kgtLicp2HP5fBJS8JrZKLVIZGVJAVg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
@@ -105,12 +105,12 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@babel/generator": {
|
||||
"version": "7.27.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.3.tgz",
|
||||
"integrity": "sha512-xnlJYj5zepml8NXtjkG0WquFUv8RskFqyFcVgTBp5k+NaA/8uw/K+OSVf8AMGw5e9HKP2ETd5xpK5MLZQD6b4Q==",
|
||||
"version": "7.27.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.5.tgz",
|
||||
"integrity": "sha512-ZGhA37l0e/g2s1Cnzdix0O3aLYm66eF8aufiVteOgnwxgnRP8GoyMj7VWsgWnQbVKXyge7hqrFh2K2TQM6t1Hw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/parser": "^7.27.3",
|
||||
"@babel/parser": "^7.27.5",
|
||||
"@babel/types": "^7.27.3",
|
||||
"@jridgewell/gen-mapping": "^0.3.5",
|
||||
"@jridgewell/trace-mapping": "^0.3.25",
|
||||
@@ -207,23 +207,23 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/helpers": {
|
||||
"version": "7.27.4",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.4.tgz",
|
||||
"integrity": "sha512-Y+bO6U+I7ZKaM5G5rDUZiYfUvQPUibYmAFe7EnKdnKBbVXDZxvp+MWOH5gYciY0EPk4EScsuFMQBbEfpdRKSCQ==",
|
||||
"version": "7.27.6",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.6.tgz",
|
||||
"integrity": "sha512-muE8Tt8M22638HU31A3CgfSUciwz1fhATfoVai05aPXGor//CdWDCbnlY1yvBPo07njuVOCNGCSp/GTt12lIug==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/template": "^7.27.2",
|
||||
"@babel/types": "^7.27.3"
|
||||
"@babel/types": "^7.27.6"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/parser": {
|
||||
"version": "7.27.4",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.4.tgz",
|
||||
"integrity": "sha512-BRmLHGwpUqLFR2jzx9orBuX/ABDkj2jLKOXrHDTN2aOKL+jFDDKaRNo9nyYsIl9h/UE/7lMKdDjKQQyxKKDZ7g==",
|
||||
"version": "7.27.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.5.tgz",
|
||||
"integrity": "sha512-OsQd175SxWkGlzbny8J3K8TnnDD0N3lrIUtB92xwyRpzaenGZhxDvxN/JgU00U3CDZNj9tPuDJ5H0WS4Nt3vKg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/types": "^7.27.3"
|
||||
@@ -268,9 +268,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/runtime": {
|
||||
"version": "7.27.4",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.4.tgz",
|
||||
"integrity": "sha512-t3yaEOuGu9NlIZ+hIeGbBjFtZT7j2cb2tg0fuaJKeGotchRjjLfrBA9Kwf8quhpP1EUuxModQg04q/mBwyg8uA==",
|
||||
"version": "7.27.6",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz",
|
||||
"integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
@@ -309,9 +309,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/types": {
|
||||
"version": "7.27.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.3.tgz",
|
||||
"integrity": "sha512-Y1GkI4ktrtvmawoSq+4FCVHNryea6uR+qUQy0AGxLSsjCX0nVmkYQMBLHDkXZuo5hGx7eYdnIaslsdBFm7zbUw==",
|
||||
"version": "7.27.6",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.6.tgz",
|
||||
"integrity": "sha512-ETyHEk2VHHvl9b9jZP5IHPavHYk57EhanlRRuae9XCpb/j5bDCbPPMOBfCWhnl/7EDJz0jEMCi/RhccCE8r1+Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/helper-string-parser": "^7.27.1",
|
||||
@@ -1237,16 +1237,16 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@rolldown/pluginutils": {
|
||||
"version": "1.0.0-beta.9",
|
||||
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.9.tgz",
|
||||
"integrity": "sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==",
|
||||
"version": "1.0.0-beta.11",
|
||||
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz",
|
||||
"integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@rollup/rollup-android-arm-eabi": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.41.1.tgz",
|
||||
"integrity": "sha512-NELNvyEWZ6R9QMkiytB4/L4zSEaBC03KIXEghptLGLZWJ6VPrL63ooZQCOnlx36aQPGhzuOMwDerC1Eb2VmrLw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.42.0.tgz",
|
||||
"integrity": "sha512-gldmAyS9hpj+H6LpRNlcjQWbuKUtb94lodB9uCz71Jm+7BxK1VIOo7y62tZZwxhA7j1ylv/yQz080L5WkS+LoQ==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
@@ -1258,9 +1258,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-android-arm64": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.41.1.tgz",
|
||||
"integrity": "sha512-DXdQe1BJ6TK47ukAoZLehRHhfKnKg9BjnQYUu9gzhI8Mwa1d2fzxA1aw2JixHVl403bwp1+/o/NhhHtxWJBgEA==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.42.0.tgz",
|
||||
"integrity": "sha512-bpRipfTgmGFdCZDFLRvIkSNO1/3RGS74aWkJJTFJBH7h3MRV4UijkaEUeOMbi9wxtxYmtAbVcnMtHTPBhLEkaw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1272,9 +1272,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-darwin-arm64": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.41.1.tgz",
|
||||
"integrity": "sha512-5afxvwszzdulsU2w8JKWwY8/sJOLPzf0e1bFuvcW5h9zsEg+RQAojdW0ux2zyYAz7R8HvvzKCjLNJhVq965U7w==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.42.0.tgz",
|
||||
"integrity": "sha512-JxHtA081izPBVCHLKnl6GEA0w3920mlJPLh89NojpU2GsBSB6ypu4erFg/Wx1qbpUbepn0jY4dVWMGZM8gplgA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1286,9 +1286,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-darwin-x64": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.41.1.tgz",
|
||||
"integrity": "sha512-egpJACny8QOdHNNMZKf8xY0Is6gIMz+tuqXlusxquWu3F833DcMwmGM7WlvCO9sB3OsPjdC4U0wHw5FabzCGZg==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.42.0.tgz",
|
||||
"integrity": "sha512-rv5UZaWVIJTDMyQ3dCEK+m0SAn6G7H3PRc2AZmExvbDvtaDc+qXkei0knQWcI3+c9tEs7iL/4I4pTQoPbNL2SA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1300,9 +1300,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-freebsd-arm64": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.41.1.tgz",
|
||||
"integrity": "sha512-DBVMZH5vbjgRk3r0OzgjS38z+atlupJ7xfKIDJdZZL6sM6wjfDNo64aowcLPKIx7LMQi8vybB56uh1Ftck/Atg==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.42.0.tgz",
|
||||
"integrity": "sha512-fJcN4uSGPWdpVmvLuMtALUFwCHgb2XiQjuECkHT3lWLZhSQ3MBQ9pq+WoWeJq2PrNxr9rPM1Qx+IjyGj8/c6zQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1314,9 +1314,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-freebsd-x64": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.41.1.tgz",
|
||||
"integrity": "sha512-3FkydeohozEskBxNWEIbPfOE0aqQgB6ttTkJ159uWOFn42VLyfAiyD9UK5mhu+ItWzft60DycIN1Xdgiy8o/SA==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.42.0.tgz",
|
||||
"integrity": "sha512-CziHfyzpp8hJpCVE/ZdTizw58gr+m7Y2Xq5VOuCSrZR++th2xWAz4Nqk52MoIIrV3JHtVBhbBsJcAxs6NammOQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1328,9 +1328,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-arm-gnueabihf": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.41.1.tgz",
|
||||
"integrity": "sha512-wC53ZNDgt0pqx5xCAgNunkTzFE8GTgdZ9EwYGVcg+jEjJdZGtq9xPjDnFgfFozQI/Xm1mh+D9YlYtl+ueswNEg==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.42.0.tgz",
|
||||
"integrity": "sha512-UsQD5fyLWm2Fe5CDM7VPYAo+UC7+2Px4Y+N3AcPh/LdZu23YcuGPegQly++XEVaC8XUTFVPscl5y5Cl1twEI4A==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
@@ -1342,9 +1342,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-arm-musleabihf": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.41.1.tgz",
|
||||
"integrity": "sha512-jwKCca1gbZkZLhLRtsrka5N8sFAaxrGz/7wRJ8Wwvq3jug7toO21vWlViihG85ei7uJTpzbXZRcORotE+xyrLA==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.42.0.tgz",
|
||||
"integrity": "sha512-/i8NIrlgc/+4n1lnoWl1zgH7Uo0XK5xK3EDqVTf38KvyYgCU/Rm04+o1VvvzJZnVS5/cWSd07owkzcVasgfIkQ==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
@@ -1356,9 +1356,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-arm64-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-g0UBcNknsmmNQ8V2d/zD2P7WWfJKU0F1nu0k5pW4rvdb+BIqMm8ToluW/eeRmxCared5dD76lS04uL4UaNgpNA==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-eoujJFOvoIBjZEi9hJnXAbWg+Vo1Ov8n/0IKZZcPZ7JhBzxh2A+2NFyeMZIRkY9iwBvSjloKgcvnjTbGKHE44Q==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1370,9 +1370,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-arm64-musl": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.41.1.tgz",
|
||||
"integrity": "sha512-XZpeGB5TKEZWzIrj7sXr+BEaSgo/ma/kCgrZgL0oo5qdB1JlTzIYQKel/RmhT6vMAvOdM2teYlAaOGJpJ9lahg==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.42.0.tgz",
|
||||
"integrity": "sha512-/3NrcOWFSR7RQUQIuZQChLND36aTU9IYE4j+TB40VU78S+RA0IiqHR30oSh6P1S9f9/wVOenHQnacs/Byb824g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1384,9 +1384,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-loongarch64-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-bkCfDJ4qzWfFRCNt5RVV4DOw6KEgFTUZi2r2RuYhGWC8WhCA8lCAJhDeAmrM/fdiAH54m0mA0Vk2FGRPyzI+tw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-O8AplvIeavK5ABmZlKBq9/STdZlnQo7Sle0LLhVA7QT+CiGpNVe197/t8Aph9bhJqbDVGCHpY2i7QyfEDDStDg==",
|
||||
"cpu": [
|
||||
"loong64"
|
||||
],
|
||||
@@ -1398,9 +1398,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-powerpc64le-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-3mr3Xm+gvMX+/8EKogIZSIEF0WUu0HL9di+YWlJpO8CQBnoLAEL/roTCxuLncEdgcfJcvA4UMOf+2dnjl4Ut1A==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-6Qb66tbKVN7VyQrekhEzbHRxXXFFD8QKiFAwX5v9Xt6FiJ3BnCVBuyBxa2fkFGqxOCSGGYNejxd8ht+q5SnmtA==",
|
||||
"cpu": [
|
||||
"ppc64"
|
||||
],
|
||||
@@ -1412,9 +1412,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-riscv64-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-3rwCIh6MQ1LGrvKJitQjZFuQnT2wxfU+ivhNBzmxXTXPllewOF7JR1s2vMX/tWtUYFgphygxjqMl76q4aMotGw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-KQETDSEBamQFvg/d8jajtRwLNBlGc3aKpaGiP/LvEbnmVUKlFta1vqJqTrvPtsYsfbE/DLg5CC9zyXRX3fnBiA==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
@@ -1426,9 +1426,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-riscv64-musl": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.41.1.tgz",
|
||||
"integrity": "sha512-LdIUOb3gvfmpkgFZuccNa2uYiqtgZAz3PTzjuM5bH3nvuy9ty6RGc/Q0+HDFrHrizJGVpjnTZ1yS5TNNjFlklw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.42.0.tgz",
|
||||
"integrity": "sha512-qMvnyjcU37sCo/tuC+JqeDKSuukGAd+pVlRl/oyDbkvPJ3awk6G6ua7tyum02O3lI+fio+eM5wsVd66X0jQtxw==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
@@ -1440,9 +1440,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-s390x-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-oIE6M8WC9ma6xYqjvPhzZYk6NbobIURvP/lEbh7FWplcMO6gn7MM2yHKA1eC/GvYwzNKK/1LYgqzdkZ8YFxR8g==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-I2Y1ZUgTgU2RLddUHXTIgyrdOwljjkmcZ/VilvaEumtS3Fkuhbw4p4hgHc39Ypwvo2o7sBFNl2MquNvGCa55Iw==",
|
||||
"cpu": [
|
||||
"s390x"
|
||||
],
|
||||
@@ -1454,9 +1454,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-x64-gnu": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.41.1.tgz",
|
||||
"integrity": "sha512-cWBOvayNvA+SyeQMp79BHPK8ws6sHSsYnK5zDcsC3Hsxr1dgTABKjMnMslPq1DvZIp6uO7kIWhiGwaTdR4Og9A==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.42.0.tgz",
|
||||
"integrity": "sha512-Gfm6cV6mj3hCUY8TqWa63DB8Mx3NADoFwiJrMpoZ1uESbK8FQV3LXkhfry+8bOniq9pqY1OdsjFWNsSbfjPugw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1468,9 +1468,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-linux-x64-musl": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.41.1.tgz",
|
||||
"integrity": "sha512-y5CbN44M+pUCdGDlZFzGGBSKCA4A/J2ZH4edTYSSxFg7ce1Xt3GtydbVKWLlzL+INfFIZAEg1ZV6hh9+QQf9YQ==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.42.0.tgz",
|
||||
"integrity": "sha512-g86PF8YZ9GRqkdi0VoGlcDUb4rYtQKyTD1IVtxxN4Hpe7YqLBShA7oHMKU6oKTCi3uxwW4VkIGnOaH/El8de3w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1482,9 +1482,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-win32-arm64-msvc": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.41.1.tgz",
|
||||
"integrity": "sha512-lZkCxIrjlJlMt1dLO/FbpZbzt6J/A8p4DnqzSa4PWqPEUUUnzXLeki/iyPLfV0BmHItlYgHUqJe+3KiyydmiNQ==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.42.0.tgz",
|
||||
"integrity": "sha512-+axkdyDGSp6hjyzQ5m1pgcvQScfHnMCcsXkx8pTgy/6qBmWVhtRVlgxjWwDp67wEXXUr0x+vD6tp5W4x6V7u1A==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -1496,9 +1496,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-win32-ia32-msvc": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.41.1.tgz",
|
||||
"integrity": "sha512-+psFT9+pIh2iuGsxFYYa/LhS5MFKmuivRsx9iPJWNSGbh2XVEjk90fmpUEjCnILPEPJnikAU6SFDiEUyOv90Pg==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.42.0.tgz",
|
||||
"integrity": "sha512-F+5J9pelstXKwRSDq92J0TEBXn2nfUrQGg+HK1+Tk7VOL09e0gBqUHugZv7SW4MGrYj41oNCUe3IKCDGVlis2g==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
@@ -1510,9 +1510,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@rollup/rollup-win32-x64-msvc": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.41.1.tgz",
|
||||
"integrity": "sha512-Wq2zpapRYLfi4aKxf2Xff0tN+7slj2d4R87WEzqw7ZLsVvO5zwYCIuEGSZYiK41+GlwUo1HiR+GdkLEJnCKTCw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.42.0.tgz",
|
||||
"integrity": "sha512-LpHiJRwkaVz/LqjHjK8LCi8osq7elmpwujwbXKNW88bM8eeGxavJIKKjkjpMHAh/2xfnrt1ZSnhTv41WYUHYmA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1578,9 +1578,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@types/estree": {
|
||||
"version": "1.0.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz",
|
||||
"integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==",
|
||||
"version": "1.0.8",
|
||||
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
|
||||
"integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/estree-jsx": {
|
||||
@@ -1623,24 +1623,24 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/prop-types": {
|
||||
"version": "15.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz",
|
||||
"integrity": "sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==",
|
||||
"version": "15.7.15",
|
||||
"resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz",
|
||||
"integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/react": {
|
||||
"version": "19.1.6",
|
||||
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.6.tgz",
|
||||
"integrity": "sha512-JeG0rEWak0N6Itr6QUx+X60uQmN+5t3j9r/OVDtWzFXKaj6kD1BwJzOksD0FF6iWxZlbE1kB0q9vtnU2ekqa1Q==",
|
||||
"version": "19.1.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.7.tgz",
|
||||
"integrity": "sha512-BnsPLV43ddr05N71gaGzyZ5hzkCmGwhMvYc8zmvI8Ci1bRkkDSzDDVfAXfN2tk748OwI7ediiPX6PfT9p0QGVg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"csstype": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/react-dom": {
|
||||
"version": "19.1.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz",
|
||||
"integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==",
|
||||
"version": "19.1.6",
|
||||
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.6.tgz",
|
||||
"integrity": "sha512-4hOiT/dwO8Ko0gV1m/TJZYk3y0KBnY9vzDh7W+DH17b2HFSOGgdj33dhihPeuy3l0q23+4e+hoXHV6hCC4dCXw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peerDependencies": {
|
||||
@@ -1669,16 +1669,16 @@
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/@vitejs/plugin-react": {
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.0.tgz",
|
||||
"integrity": "sha512-JuLWaEqypaJmOJPLWwO335Ig6jSgC1FTONCWAxnqcQthLTK/Yc9aH6hr9z/87xciejbQcnP3GnA1FWUSWeXaeg==",
|
||||
"version": "4.5.2",
|
||||
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.2.tgz",
|
||||
"integrity": "sha512-QNVT3/Lxx99nMQWJWF7K4N6apUEuT0KlZA3mx/mVaoGj3smm/8rc8ezz15J1pcbcjDK0V15rpHetVfya08r76Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/core": "^7.26.10",
|
||||
"@babel/plugin-transform-react-jsx-self": "^7.25.9",
|
||||
"@babel/plugin-transform-react-jsx-source": "^7.25.9",
|
||||
"@rolldown/pluginutils": "1.0.0-beta.9",
|
||||
"@babel/core": "^7.27.4",
|
||||
"@babel/plugin-transform-react-jsx-self": "^7.27.1",
|
||||
"@babel/plugin-transform-react-jsx-source": "^7.27.1",
|
||||
"@rolldown/pluginutils": "1.0.0-beta.11",
|
||||
"@types/babel__core": "^7.20.5",
|
||||
"react-refresh": "^0.17.0"
|
||||
},
|
||||
@@ -1686,7 +1686,7 @@
|
||||
"node": "^14.18.0 || >=16.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0"
|
||||
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0"
|
||||
}
|
||||
},
|
||||
"node_modules/babel-plugin-macros": {
|
||||
@@ -1770,9 +1770,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/caniuse-lite": {
|
||||
"version": "1.0.30001720",
|
||||
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001720.tgz",
|
||||
"integrity": "sha512-Ec/2yV2nNPwb4DnTANEV99ZWwm3ZWfdlfkQbWSDDt+PsXEVYwlhPH8tdMaPunYTKKmz7AnHi2oNEi1GcmKCD8g==",
|
||||
"version": "1.0.30001721",
|
||||
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001721.tgz",
|
||||
"integrity": "sha512-cOuvmUVtKrtEaoKiO0rSc29jcjwMwX5tOHDy4MgVFEWiUXj4uBMJkwI8MDySkgXidpMiHUcviogAvFi4pA2hDQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
@@ -1959,9 +1959,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/electron-to-chromium": {
|
||||
"version": "1.5.161",
|
||||
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.161.tgz",
|
||||
"integrity": "sha512-hwtetwfKNZo/UlwHIVBlKZVdy7o8bIZxxKs0Mv/ROPiQQQmDgdm5a+KvKtBsxM8ZjFzTaCeLoodZ8jiBE3o9rA==",
|
||||
"version": "1.5.166",
|
||||
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.166.tgz",
|
||||
"integrity": "sha512-QPWqHL0BglzPYyJJ1zSSmwFFL6MFXhbACOCcsCdUMCkzPdS9/OIBVxg516X/Ado2qwAq8k0nJJ7phQPCqiaFAw==",
|
||||
"dev": true,
|
||||
"license": "ISC"
|
||||
},
|
||||
@@ -2054,9 +2054,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fdir": {
|
||||
"version": "6.4.5",
|
||||
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.5.tgz",
|
||||
"integrity": "sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==",
|
||||
"version": "6.4.6",
|
||||
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz",
|
||||
"integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peerDependencies": {
|
||||
@@ -3342,9 +3342,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/rollup": {
|
||||
"version": "4.41.1",
|
||||
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.41.1.tgz",
|
||||
"integrity": "sha512-cPmwD3FnFv8rKMBc1MxWCwVQFxwf1JEmSX3iQXrRVVG15zerAIXRjMFVWnd5Q5QvgKF7Aj+5ykXFhUl+QGnyOw==",
|
||||
"version": "4.42.0",
|
||||
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.42.0.tgz",
|
||||
"integrity": "sha512-LW+Vse3BJPyGJGAJt1j8pWDKPd73QM8cRXYK1IxOBgL2AGLu7Xd2YOW0M2sLUBCkF5MshXXtMApyEAEzMVMsnw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -3358,29 +3358,36 @@
|
||||
"npm": ">=8.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@rollup/rollup-android-arm-eabi": "4.41.1",
|
||||
"@rollup/rollup-android-arm64": "4.41.1",
|
||||
"@rollup/rollup-darwin-arm64": "4.41.1",
|
||||
"@rollup/rollup-darwin-x64": "4.41.1",
|
||||
"@rollup/rollup-freebsd-arm64": "4.41.1",
|
||||
"@rollup/rollup-freebsd-x64": "4.41.1",
|
||||
"@rollup/rollup-linux-arm-gnueabihf": "4.41.1",
|
||||
"@rollup/rollup-linux-arm-musleabihf": "4.41.1",
|
||||
"@rollup/rollup-linux-arm64-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-arm64-musl": "4.41.1",
|
||||
"@rollup/rollup-linux-loongarch64-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-powerpc64le-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-riscv64-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-riscv64-musl": "4.41.1",
|
||||
"@rollup/rollup-linux-s390x-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-x64-gnu": "4.41.1",
|
||||
"@rollup/rollup-linux-x64-musl": "4.41.1",
|
||||
"@rollup/rollup-win32-arm64-msvc": "4.41.1",
|
||||
"@rollup/rollup-win32-ia32-msvc": "4.41.1",
|
||||
"@rollup/rollup-win32-x64-msvc": "4.41.1",
|
||||
"@rollup/rollup-android-arm-eabi": "4.42.0",
|
||||
"@rollup/rollup-android-arm64": "4.42.0",
|
||||
"@rollup/rollup-darwin-arm64": "4.42.0",
|
||||
"@rollup/rollup-darwin-x64": "4.42.0",
|
||||
"@rollup/rollup-freebsd-arm64": "4.42.0",
|
||||
"@rollup/rollup-freebsd-x64": "4.42.0",
|
||||
"@rollup/rollup-linux-arm-gnueabihf": "4.42.0",
|
||||
"@rollup/rollup-linux-arm-musleabihf": "4.42.0",
|
||||
"@rollup/rollup-linux-arm64-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-arm64-musl": "4.42.0",
|
||||
"@rollup/rollup-linux-loongarch64-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-powerpc64le-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-riscv64-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-riscv64-musl": "4.42.0",
|
||||
"@rollup/rollup-linux-s390x-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-x64-gnu": "4.42.0",
|
||||
"@rollup/rollup-linux-x64-musl": "4.42.0",
|
||||
"@rollup/rollup-win32-arm64-msvc": "4.42.0",
|
||||
"@rollup/rollup-win32-ia32-msvc": "4.42.0",
|
||||
"@rollup/rollup-win32-x64-msvc": "4.42.0",
|
||||
"fsevents": "~2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/rollup/node_modules/@types/estree": {
|
||||
"version": "1.0.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz",
|
||||
"integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/scheduler": {
|
||||
"version": "0.26.0",
|
||||
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz",
|
||||
|
||||
@@ -8,6 +8,7 @@ Factory method to initialize an extractor instance based on its name.
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from contextlib import suppress
|
||||
import mimetypes
|
||||
import os
|
||||
import requests
|
||||
@@ -16,6 +17,7 @@ from retrying import retry
|
||||
import re
|
||||
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
from auto_archiver.utils.url import get_media_url_best_quality
|
||||
|
||||
|
||||
class Extractor(BaseModule):
|
||||
@@ -70,10 +72,20 @@ class Extractor(BaseModule):
|
||||
return ""
|
||||
|
||||
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
|
||||
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
|
||||
def download_from_url(self, url: str, to_filename: str = None, verbose=True, try_best_quality=False) -> str:
|
||||
"""
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
|
||||
"""
|
||||
|
||||
if try_best_quality:
|
||||
with suppress(Exception):
|
||||
# Attempt to download the original URL
|
||||
best_quality_url = get_media_url_best_quality(url)
|
||||
orig_download = self.download_from_url(best_quality_url, to_filename, verbose)
|
||||
if orig_download:
|
||||
return orig_download, best_quality_url
|
||||
|
||||
if not to_filename:
|
||||
to_filename = url.split("/")[-1].split("?")[0]
|
||||
if len(to_filename) > 64:
|
||||
@@ -98,10 +110,12 @@ class Extractor(BaseModule):
|
||||
with open(to_filename, "wb") as f:
|
||||
for chunk in d.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
if try_best_quality:
|
||||
return to_filename, url
|
||||
return to_filename
|
||||
|
||||
except requests.RequestException as e:
|
||||
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
|
||||
|
||||
@abstractmethod
|
||||
def download(self, item: Metadata) -> Metadata | False:
|
||||
|
||||
@@ -17,13 +17,14 @@
|
||||
"default": 50,
|
||||
"help": "maximum number of videos to download from the page (0 = no download, inf = no limit).",
|
||||
},
|
||||
"exclude_media_extensions": {
|
||||
"default": ".svg,.ico,.gif",
|
||||
"help": "CSV of media (image/video) file extensions to exclude from download",
|
||||
},
|
||||
"user_data_dir": {
|
||||
"default": "secrets/antibot_user_data",
|
||||
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.",
|
||||
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.",
|
||||
},
|
||||
"detect_auth_wall": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.",
|
||||
},
|
||||
"proxy": {
|
||||
"default": None,
|
||||
@@ -31,7 +32,9 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile.
|
||||
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.
|
||||
|
||||
Still in trial development, please report any issues or suggestions via GitHub Issues.
|
||||
|
||||
### Features
|
||||
- Extracts the HTML source code of the page.
|
||||
@@ -40,7 +43,6 @@
|
||||
- Downloads images and videos from the page, excluding specified file extensions.
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
import base64
|
||||
import math
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from urllib.parse import urljoin
|
||||
import glob
|
||||
import stat
|
||||
import importlib.util
|
||||
|
||||
from loguru import logger
|
||||
@@ -15,7 +13,9 @@ from seleniumbase import SB
|
||||
|
||||
from auto_archiver.core import Extractor, Enricher, Metadata, Media
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.url import is_relevant_url
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
@@ -25,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
self.agent = None # Use the default UserAgent
|
||||
|
||||
# parse configuration options
|
||||
self.exclude_media_mimetypes = set(
|
||||
[mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")]
|
||||
) - {None}
|
||||
|
||||
if self.max_download_images == "inf":
|
||||
self.max_download_images = math.inf
|
||||
else:
|
||||
@@ -39,7 +35,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
else:
|
||||
self.max_download_videos = int(self.max_download_videos)
|
||||
|
||||
self._prepare_and_warn_about_docker_and_user_data_dir()
|
||||
self._prepare_user_data_dir()
|
||||
|
||||
self.dropins = self.load_dropins()
|
||||
|
||||
@@ -77,19 +73,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
result.status = "antibot"
|
||||
return result
|
||||
|
||||
def _prepare_and_warn_about_docker_and_user_data_dir(self):
|
||||
os.makedirs(self.user_data_dir, exist_ok=True)
|
||||
|
||||
in_docker = os.environ.get("RUNNING_IN_DOCKER")
|
||||
if in_docker and self.user_data_dir:
|
||||
st = os.stat(self.user_data_dir)
|
||||
perms = stat.filemode(st.st_mode)
|
||||
owner = st.st_uid
|
||||
group = st.st_gid
|
||||
if owner != 0 or group != 0:
|
||||
logger.warning(
|
||||
f"""ANTIBOT: Running in Docker with user_data_dir {self.user_data_dir} with permissions {perms} and non-root {owner=}. This may cause issues with Chrome, if you get 'session not created' errors make sure to remove the folder and let docker create it."""
|
||||
)
|
||||
def _prepare_user_data_dir(self):
|
||||
if self.user_data_dir:
|
||||
in_docker = os.environ.get("RUNNING_IN_DOCKER")
|
||||
if in_docker:
|
||||
self.user_data_dir = self.user_data_dir.rstrip(os.path.sep) + "_docker"
|
||||
os.makedirs(self.user_data_dir, exist_ok=True)
|
||||
|
||||
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
||||
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
||||
@@ -102,39 +91,41 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
sb.uc_open_with_reconnect(url, 4)
|
||||
|
||||
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
|
||||
sb.uc_gui_handle_cf()
|
||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||
|
||||
# TODO: implement other Captcha handling
|
||||
sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected
|
||||
dropin = self._get_suitable_dropin(url, sb)
|
||||
dropin.open_page(url)
|
||||
|
||||
suitable_dropin = self._get_suitable_dropin(url, sb)
|
||||
|
||||
if suitable_dropin:
|
||||
suitable_dropin.open_page(url)
|
||||
|
||||
if self._hit_auth_wall(sb):
|
||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
|
||||
return False
|
||||
logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
|
||||
|
||||
sb.wait_for_ready_state_complete()
|
||||
sb.sleep(1) # margin for the page to load completely
|
||||
|
||||
to_enrich.set_title(sb.get_title())
|
||||
self._enrich_html_source_code(sb, to_enrich)
|
||||
|
||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||
if self.save_to_pdf:
|
||||
self._enrich_full_page_pdf(sb, to_enrich)
|
||||
|
||||
downloaded_images, downloaded_videos = 0, 0
|
||||
if suitable_dropin:
|
||||
downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich)
|
||||
downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich)
|
||||
|
||||
self._enrich_download_media(
|
||||
sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images
|
||||
sb,
|
||||
to_enrich,
|
||||
css_selector=dropin.images_selectors(),
|
||||
max_media=self.max_download_images - downloaded_images,
|
||||
)
|
||||
self._enrich_download_media(
|
||||
sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos
|
||||
sb,
|
||||
to_enrich,
|
||||
css_selector=dropin.video_selectors(),
|
||||
max_media=self.max_download_videos - downloaded_videos,
|
||||
)
|
||||
logger.success(f"ANTIBOT completed for {url_sample}")
|
||||
logger.info(f"ANTIBOT completed for {url_sample}")
|
||||
|
||||
return to_enrich
|
||||
except selenium.common.exceptions.SessionNotCreatedException as e:
|
||||
@@ -155,10 +146,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
"""
|
||||
for dropin in self.dropins:
|
||||
if dropin.suitable(url):
|
||||
logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}")
|
||||
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
|
||||
return dropin(sb, self)
|
||||
# logger.warning(f"ANTIBOT no suitable drop-in found for {url}")
|
||||
return None
|
||||
|
||||
return DefaultDropin(sb, self)
|
||||
|
||||
def _hit_auth_wall(self, sb: SB) -> bool:
|
||||
"""
|
||||
@@ -168,8 +159,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
# TODO: improve this detection logic, currently it is very basic and may not cover all cases
|
||||
|
||||
# Common URL patterns
|
||||
url = sb.get_current_url().lower()
|
||||
if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]):
|
||||
current_url = sb.get_current_url().lower()
|
||||
if any(kw in current_url for kw in ["login", "signin", "signup", "register", "captcha"]):
|
||||
return True
|
||||
|
||||
# Common visible text markers
|
||||
@@ -245,8 +236,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
Enriches the full page screenshot of the Metadata object.
|
||||
This method is called by the enrich method.
|
||||
"""
|
||||
x = sb.execute_script("return document.documentElement.scrollWidth")
|
||||
y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000)
|
||||
start_size = sb.get_window_size()
|
||||
w, h = start_size["width"], start_size["height"]
|
||||
|
||||
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
||||
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
||||
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
|
||||
sb.set_window_size(x, y)
|
||||
|
||||
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
||||
@@ -278,12 +273,9 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
"""
|
||||
if max_media == 0:
|
||||
return
|
||||
logger.debug(
|
||||
f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items."
|
||||
)
|
||||
url = to_enrich.get_url()
|
||||
all_urls = set()
|
||||
# media_elements = sb.find_elements(css_selector)
|
||||
|
||||
sources = sb.execute_script(f"""
|
||||
return Array.from(document.querySelectorAll("{css_selector}"))
|
||||
.map(el => el.src || el.href)
|
||||
@@ -293,10 +285,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
if len(all_urls) >= max_media:
|
||||
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
||||
break
|
||||
mimerype = mimetypes.guess_type(src)[0]
|
||||
if mimerype in self.exclude_media_mimetypes:
|
||||
if not is_relevant_url(src):
|
||||
continue
|
||||
full_src = urljoin(url, src)
|
||||
if full_src not in all_urls and (filename := self.download_from_url(full_src)):
|
||||
if full_src not in all_urls:
|
||||
filename, full_src = self.download_from_url(full_src, try_best_quality=True)
|
||||
if not filename:
|
||||
continue
|
||||
all_urls.add(full_src)
|
||||
to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import os
|
||||
from loguru import logger
|
||||
from seleniumbase import SB
|
||||
import yt_dlp
|
||||
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core import Extractor, Media, Metadata
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
|
||||
|
||||
class Dropin:
|
||||
@@ -36,6 +39,20 @@ class Dropin:
|
||||
"""
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
def images_selectors() -> str:
|
||||
"""
|
||||
CSS selector to find images in the HTML page
|
||||
"""
|
||||
return "img"
|
||||
|
||||
@staticmethod
|
||||
def video_selectors() -> str:
|
||||
"""
|
||||
CSS selector to find videos in the HTML page.
|
||||
"""
|
||||
return "video, source"
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
"""
|
||||
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
|
||||
@@ -50,3 +67,59 @@ class Dropin:
|
||||
:return: A tuple (number of Images added, number of Videos added).
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
def _get_username_password(self, site) -> tuple[str, str]:
|
||||
"""
|
||||
Get the username and password for the site from the extractor's auth data.
|
||||
:return: A tuple (username, password).
|
||||
"""
|
||||
auth = self.extractor.auth_for_site(site)
|
||||
username = auth.get("username", "")
|
||||
password = auth.get("password", "")
|
||||
if not username or not password:
|
||||
raise ValueError(f"{site} authentication requires a username and password.")
|
||||
return username, password
|
||||
|
||||
def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int:
|
||||
"""
|
||||
Download videos using yt-dlp.
|
||||
:param video_urls: List of video URLs to download.
|
||||
:return: The number of videos downloaded.
|
||||
"""
|
||||
if type(self.extractor.max_download_videos) is int:
|
||||
video_urls = video_urls[: self.extractor.max_download_videos]
|
||||
|
||||
if not video_urls:
|
||||
return 0
|
||||
|
||||
ydl_options = [
|
||||
"-o",
|
||||
os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
|
||||
"--quiet",
|
||||
"--no-playlist",
|
||||
"--no-write-subs",
|
||||
"--no-write-auto-subs",
|
||||
"--postprocessor-args",
|
||||
"ffmpeg:-bitexact",
|
||||
"--max-filesize",
|
||||
"1000M", # Limit to 1GB per video
|
||||
]
|
||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||
downloaded = 0
|
||||
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
||||
for url in video_urls:
|
||||
try:
|
||||
logger.debug(f"Downloading video from URL: {url}")
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl_entry_to_filename(ydl, info)
|
||||
if not filename: # Failed to download video.
|
||||
continue
|
||||
media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
if x in info:
|
||||
media.set(x, info[x])
|
||||
to_enrich.add_media(media)
|
||||
downloaded += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading {url}: {e}")
|
||||
return downloaded
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
|
||||
class DefaultDropin(Dropin):
|
||||
"""
|
||||
A default fallback drop-in class for handling generic cases in the antibot extractor enricher module.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def suitable(url: str) -> bool:
|
||||
return False
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
return True
|
||||
|
||||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
||||
return 0, 0
|
||||
@@ -0,0 +1,78 @@
|
||||
from contextlib import suppress
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class RedditDropin(Dropin):
|
||||
"""
|
||||
A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def suitable(url: str) -> bool:
|
||||
return "reddit.com" in url
|
||||
|
||||
@staticmethod
|
||||
def images_selectors() -> str:
|
||||
return "shreddit-post img"
|
||||
|
||||
@staticmethod
|
||||
def video_selectors() -> str:
|
||||
return "shreddit-post video, shreddit-post source"
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
if self.sb.is_text_visible("You've been blocked by network security."):
|
||||
self._login()
|
||||
if url != self.sb.get_current_url():
|
||||
self.sb.open(url)
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def _login(self):
|
||||
self.sb.click_link_text("Log in")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
self._close_cookies_banner()
|
||||
|
||||
username, password = self._get_username_password("reddit.com")
|
||||
logger.debug("RedditDropin Logging in to VK with username: {}", username)
|
||||
|
||||
self.sb.type("#login-username", username)
|
||||
self.sb.type("#login-password", password)
|
||||
|
||||
elem = self.sb.find_element("button.login")
|
||||
self.sb.execute_script("arguments[0].scrollIntoView(true);", elem)
|
||||
self.sb.slow_click("button.login")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
|
||||
if "https://www.reddit.com/login/" in self.sb.get_current_url():
|
||||
self.sb.sleep(5)
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
|
||||
if self.sb.is_text_visible("You've been blocked by network security."):
|
||||
self.sb.click_link_text("Log in")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
if self.sb.is_text_visible("Welcome back"):
|
||||
logger.debug("RedditDropin Login successful")
|
||||
self.sb.click_if_visible("this link")
|
||||
|
||||
def _close_cookies_banner(self):
|
||||
with suppress(Exception): # selenium.common.exceptions.JavascriptException
|
||||
self.sb.execute_script("""
|
||||
document
|
||||
.querySelector("reddit-cookie-banner")
|
||||
.shadowRoot.querySelector("faceplate-dialog")
|
||||
.querySelector("#accept-all-cookies-button button")
|
||||
.click()
|
||||
""")
|
||||
|
||||
@logger.catch
|
||||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
||||
filtered_urls = self.sb.execute_script(rf"""
|
||||
return [...document.querySelectorAll("{self.video_selectors()}")]
|
||||
.map(el => el.src || el.href)
|
||||
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
||||
""")
|
||||
logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
|
||||
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
||||
@@ -1,12 +1,8 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
from auto_archiver.core.media import Media
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
|
||||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -37,10 +33,11 @@ class VkDropin(Dropin):
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
if self.sb.is_text_visible("Sign in to VK"):
|
||||
self._login()
|
||||
self.sb.open(url)
|
||||
if self._login():
|
||||
self.sb.open(url)
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def _login(self) -> bool:
|
||||
# TODO: test method
|
||||
self.sb.open("https://vk.com")
|
||||
@@ -50,13 +47,9 @@ class VkDropin(Dropin):
|
||||
return True
|
||||
|
||||
# need to login
|
||||
logger.debug("Logging in to VK...")
|
||||
auth = self.extractor.auth_for_site("vk.com")
|
||||
username = auth.get("username", "")
|
||||
password = auth.get("password", "")
|
||||
if not username or not password:
|
||||
raise ValueError("VK authentication requires a username and password.")
|
||||
logger.debug("Using username: {}", username)
|
||||
username, password = self._get_username_password("vk.com")
|
||||
logger.debug("Logging in to VK with username: {}", username)
|
||||
|
||||
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
||||
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
||||
self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10)
|
||||
@@ -80,47 +73,6 @@ class VkDropin(Dropin):
|
||||
|
||||
@logger.catch
|
||||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
||||
"""
|
||||
Extract video data from the currently open post with SeleniumBase.
|
||||
|
||||
:return: A tuple (number of Images added, number of Videos added).
|
||||
"""
|
||||
video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
|
||||
if type(self.extractor.max_download_videos) is int:
|
||||
video_urls = video_urls[: self.extractor.max_download_videos]
|
||||
|
||||
if not video_urls:
|
||||
return 0, 0
|
||||
|
||||
logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.")
|
||||
ydl_options = [
|
||||
"-o",
|
||||
os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
|
||||
"--quiet",
|
||||
"--no-playlist",
|
||||
"--no-write-subs",
|
||||
"--no-write-auto-subs",
|
||||
"--postprocessor-args",
|
||||
"ffmpeg:-bitexact",
|
||||
"--max-filesize",
|
||||
"1000M", # Limit to 1GB per video
|
||||
]
|
||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||
downloaded = 0
|
||||
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
||||
for url in video_urls:
|
||||
try:
|
||||
logger.debug(f"Downloading video from URL: {url}")
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl_entry_to_filename(ydl, info)
|
||||
if not filename: # Failed to download video.
|
||||
continue
|
||||
media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
if x in info:
|
||||
media.set(x, info[x])
|
||||
to_enrich.add_media(media)
|
||||
downloaded += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading {url}: {e}")
|
||||
return 0, downloaded
|
||||
return 0, self._download_videos_with_ytdlp(video_urls, to_enrich)
|
||||
|
||||
@@ -30,6 +30,8 @@ For a full list of video platforms supported by `yt-dlp`, see the
|
||||
custom dropins can be created to handle additional websites and passed to the archiver
|
||||
via the command line using the `--dropins` option (TODO!).
|
||||
|
||||
You can see all currently implemented dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/generic_extractor).
|
||||
|
||||
### Auto-Updates
|
||||
|
||||
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
|
||||
|
||||
@@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str:
|
||||
def is_relevant_url(url: str) -> bool:
|
||||
"""
|
||||
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
|
||||
|
||||
Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8.
|
||||
"""
|
||||
clean_url = remove_get_parameters(url)
|
||||
|
||||
@@ -104,11 +106,19 @@ def is_relevant_url(url: str) -> bool:
|
||||
("vk.com/images/reaction/",),
|
||||
# wikipedia
|
||||
("wikipedia.org/static",),
|
||||
# reddit
|
||||
("styles.redditmedia.com",), # opinionated but excludes may irrelevant images like avatars and banners
|
||||
("emoji.redditmedia.com",),
|
||||
]
|
||||
|
||||
# TODO: make these globally configurable
|
||||
IRRELEVANT_ENDS_WITH = [
|
||||
".svg", # ignore SVGs
|
||||
".ico", # ignore icons
|
||||
# ignore index files for videos, these should be handled by ytdlp
|
||||
".m3u8",
|
||||
".mpd",
|
||||
".ism",
|
||||
]
|
||||
|
||||
for end in IRRELEVANT_ENDS_WITH:
|
||||
@@ -125,6 +135,36 @@ def is_relevant_url(url: str) -> bool:
|
||||
def twitter_best_quality_url(url: str) -> str:
|
||||
"""
|
||||
some twitter image URLs point to a less-than best quality
|
||||
this returns the URL pointing to the highest (original) quality
|
||||
this returns the URL pointing to the highest (original) quality (with 'name=orig')
|
||||
"""
|
||||
return re.sub(r"name=(\w+)", "name=orig", url, 1)
|
||||
parsed = urlparse(url)
|
||||
query = parsed.query
|
||||
if "name=" in query:
|
||||
# Replace only the first occurrence of name=xxx with name=orig
|
||||
new_query = re.sub(r"name=[^&]*", "name=orig", query, 1)
|
||||
parsed = parsed._replace(query=new_query)
|
||||
return urlunparse(parsed)
|
||||
return url
|
||||
|
||||
|
||||
def get_media_url_best_quality(url: str) -> str:
|
||||
"""
|
||||
Returns the best quality URL for the given media URL, it may not exist.
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
|
||||
# twitter case
|
||||
if any(d in parsed.netloc.replace("www", "") for d in ("twitter.com", "twimg.com", "x.com")):
|
||||
url = twitter_best_quality_url(url)
|
||||
parsed = urlparse(url)
|
||||
|
||||
# some cases https://example.com/media-1280x720.mp4 to https://example.com/media.mp4
|
||||
basename = parsed.path.split("/")[-1]
|
||||
match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", basename)
|
||||
if match:
|
||||
orig_basename = match.group(1) + match.group(2)
|
||||
new_path = "/".join(parsed.path.split("/")[:-1] + [orig_basename])
|
||||
parsed = parsed._replace(path=new_path) # keep the query unchanged
|
||||
url = urlunparse(parsed)
|
||||
|
||||
return url
|
||||
|
||||
@@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"save_to_pdf": False,
|
||||
"max_download_images": 0,
|
||||
"max_download_videos": 0,
|
||||
"exclude_media_extensions": ".svg,.ico,.gif",
|
||||
"proxy": None,
|
||||
}
|
||||
|
||||
@@ -129,15 +128,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
),
|
||||
(
|
||||
"https://seleniumbase.io/apps/turnstile",
|
||||
'id="captcha-success"',
|
||||
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" style="" width="180">',
|
||||
),
|
||||
(
|
||||
"https://seleniumbase.io/apps/form_turnstile",
|
||||
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" width="120" style="">',
|
||||
),
|
||||
(
|
||||
"https://gitlab.com/users/sign_in",
|
||||
"Password",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
|
||||
def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
|
||||
"""
|
||||
Test downloading a page with Cloudflare Turnstile captcha.
|
||||
"""
|
||||
|
||||
self.extractor = setup_module(
|
||||
self.extractor_module,
|
||||
{
|
||||
"save_to_pdf": True,
|
||||
"detect_auth_wall": False,
|
||||
"max_download_images": 5,
|
||||
"max_download_videos": "inf",
|
||||
},
|
||||
)
|
||||
|
||||
item = make_item(url)
|
||||
self.extractor.enrich(item)
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ from auto_archiver.utils.url import (
|
||||
is_relevant_url,
|
||||
remove_get_parameters,
|
||||
twitter_best_quality_url,
|
||||
get_media_url_best_quality,
|
||||
)
|
||||
|
||||
|
||||
@@ -95,6 +96,11 @@ def test_remove_get_parameters(url, without_get):
|
||||
("https://example.com/150x150.jpg", True),
|
||||
("https://example.com/rsrc.php/", True),
|
||||
("https://example.com/img/emoji/", True),
|
||||
("https://styles.redditmedia.com/123", False),
|
||||
("https://emoji.redditmedia.com/abc.jpg", False),
|
||||
("https://example.com/rsrc.m3u8?asdasd=10", False),
|
||||
("https://example.com/rsrc.mpd", False),
|
||||
("https://example.com/rsrc.ism?vid=12", False),
|
||||
],
|
||||
)
|
||||
def test_is_relevant_url(url, relevant):
|
||||
@@ -104,10 +110,51 @@ def test_is_relevant_url(url, relevant):
|
||||
@pytest.mark.parametrize(
|
||||
"url, best_quality",
|
||||
[
|
||||
("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
|
||||
(
|
||||
"https://twitter.com/some_image.jpg?name=small&this_is_another=145",
|
||||
"https://twitter.com/some_image.jpg?name=orig&this_is_another=145",
|
||||
),
|
||||
("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
|
||||
("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
|
||||
],
|
||||
)
|
||||
def test_twitter_best_quality_url(url, best_quality):
|
||||
assert twitter_best_quality_url(url) == best_quality
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_url,expected_url",
|
||||
[
|
||||
# Twitter: add/replace name= to name=orig
|
||||
(
|
||||
"https://pbs.twimg.com/media/abc123?format=jpg&name=small",
|
||||
"https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
|
||||
),
|
||||
("https://pbs.twimg.com/media/abc123?name=large", "https://pbs.twimg.com/media/abc123?name=orig"),
|
||||
("https://pbs.twimg.com/media/abc123?format=jpg", "https://pbs.twimg.com/media/abc123?format=jpg"),
|
||||
# Twitter: already orig
|
||||
(
|
||||
"https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
|
||||
"https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
|
||||
),
|
||||
# X.com domain
|
||||
("https://x.com/media/abc123?name=medium", "https://x.com/media/abc123?name=orig"),
|
||||
# twimg.com domain
|
||||
("https://twimg.com/media/abc123?name=thumb", "https://twimg.com/media/abc123?name=orig"),
|
||||
# Non-twitter domain, no change
|
||||
("https://example.com/media/file.mp4", "https://example.com/media/file.mp4"),
|
||||
# Remove -WxH from basename
|
||||
("https://example.com/media/file-1280x720.mp4", "https://example.com/media/file.mp4"),
|
||||
("https://example.com/media/file-1920x1080.jpg?foo=bar", "https://example.com/media/file.jpg?foo=bar"),
|
||||
# Both twitter and -WxH
|
||||
("https://pbs.twimg.com/media/abc-1280x720.jpg?name=small", "https://pbs.twimg.com/media/abc.jpg?name=orig"),
|
||||
# No match for -WxH, no change
|
||||
("https://example.com/media/file.mp4?foo=bar", "https://example.com/media/file.mp4?foo=bar"),
|
||||
# Path with multiple directories
|
||||
("https://example.com/a/b/c/file-640x480.png", "https://example.com/a/b/c/file.png"),
|
||||
# -WxH in directory, not basename (should not change)
|
||||
("https://example.com/media-1280x720/file.mp4", "https://example.com/media-1280x720/file.mp4"),
|
||||
],
|
||||
)
|
||||
def test_get_media_url_best_quality(input_url, expected_url):
|
||||
assert get_media_url_best_quality(input_url) == expected_url
|
||||
|
||||
Reference in New Issue
Block a user