Merge pull request #311 from bellingcat/feat/seleniumbase

Replaces ScreenshotEnricher with AntibotExtractorEnricher, removes VkExtractor
This commit is contained in:
Miguel Sozinho Ramalho
2025-06-04 14:53:31 +01:00
committed by GitHub
24 changed files with 1160 additions and 536 deletions

2
.gitignore vendored
View File

@@ -36,4 +36,4 @@ docs/source/autoapi/
docs/source/modules/autogen/
scripts/settings_page.html
scripts/settings/src/schema.json
.vite
.vite

View File

@@ -11,26 +11,8 @@ ENV RUNNING_IN_DOCKER=1 \
ARG TARGETARCH
# Installing system dependencies
RUN add-apt-repository ppa:mozillateam/ppa && \
apt-get update && \
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
apt-get install -y --no-install-recommends firefox-esr && \
ln -s /usr/bin/firefox-esr /usr/bin/firefox
ARG GECKODRIVER_VERSION=0.36.0
RUN if [ $(uname -m) = "aarch64" ]; then \
GECKODRIVER_ARCH=linux-aarch64; \
else \
GECKODRIVER_ARCH=linux64; \
fi && \
wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN apt-get update && \
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool python3-tk
# Poetry and runtime
FROM base AS runtime

View File

@@ -8,7 +8,7 @@ The archiver archives web pages using the following workflow
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module would extract information from the Twitter website. The Screenshot Enricher Module will take screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available.
Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module extracts information from the Twitter website, while the AntiBot Module downloads the HTML and takes screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available.
Auto-archiver must have at least one module defined for each step of the workflow. This is done by setting the [configuration](installation/configurations.md) for your auto-archiver instance.

View File

@@ -51,8 +51,8 @@ After this, you're ready to set up your [your configuration file](configurations
If using the local installation method, you will also need to install the following dependencies locally:
1. [ffmpeg](https://www.ffmpeg.org/) - for handling of downloaded videos
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin` - for taking webpage screenshots with the screenshot enricher
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
<!-- 2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin` - for taking webpage screenshots with the screenshot enricher -->
3. (optional) [fonts-noto](https://fonts.google.com/noto) to correctly render a wide range of Unicode characters in Selenium screenshots: `sudo apt install fonts-noto -y`.
4. [Browsertrix Crawler docker image](https://hub.docker.com/r/webrecorder/browsertrix-crawler) for the WACZ enricher/archiver

695
poetry.lock generated
View File

@@ -158,6 +158,27 @@ charset-normalizer = ["charset-normalizer"]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
name = "behave"
version = "1.2.6"
description = "behave is behaviour-driven development, Python style"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
groups = ["main"]
files = [
{file = "behave-1.2.6-py2.py3-none-any.whl", hash = "sha256:ebda1a6c9e5bfe95c5f9f0a2794e01c7098b3dde86c10a95d8621c5907ff6f1c"},
{file = "behave-1.2.6.tar.gz", hash = "sha256:b9662327aa53294c1351b0a9c369093ccec1d21026f050c3bd9b3e5cccf81a86"},
]
[package.dependencies]
parse = ">=1.8.2"
parse-type = ">=0.4.2"
six = ">=1.11"
[package.extras]
develop = ["coverage", "invoke (>=0.21.0)", "modernize (>=0.5)", "path.py (>=8.1.2)", "pathlib", "pycmd", "pylint", "pytest (>=3.0)", "pytest-cov", "tox"]
docs = ["sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6)"]
[[package]]
name = "bgutil-ytdlp-pot-provider"
version = "1.1.0"
@@ -518,6 +539,18 @@ files = [
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]]
name = "chardet"
version = "5.2.0"
description = "Universal encoding detector for Python 3"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
{file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
]
[[package]]
name = "charset-normalizer"
version = "3.4.2"
@@ -646,7 +679,7 @@ files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
markers = {main = "sys_platform == \"win32\" or platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
markers = {dev = "sys_platform == \"win32\""}
[[package]]
name = "cryptography"
@@ -708,6 +741,18 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["certifi (>=2024)", "cryptography-vectors (==44.0.3)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "cssselect"
version = "1.3.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d"},
{file = "cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7"},
]
[[package]]
name = "curl-cffi"
version = "0.10.0"
@@ -808,11 +853,11 @@ description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
groups = ["main", "dev", "docs"]
markers = "python_version == \"3.10\""
files = [
{file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"},
{file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"},
]
markers = {dev = "python_version == \"3.10\"", docs = "python_version == \"3.10\""}
[package.dependencies]
typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""}
@@ -820,6 +865,33 @@ typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""}
[package.extras]
test = ["pytest (>=6)"]
[[package]]
name = "execnet"
version = "2.1.1"
description = "execnet: rapid multi-Python deployment"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"},
{file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"},
]
[package.extras]
testing = ["hatch", "pre-commit", "pytest", "tox"]
[[package]]
name = "fasteners"
version = "0.19"
description = "A python package that provides useful locks"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "fasteners-0.19-py3-none-any.whl", hash = "sha256:758819cb5d94cdedf4e836988b74de396ceacb8e2794d21f82d131fd9ee77237"},
{file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"},
]
[[package]]
name = "ffmpeg-python"
version = "0.2.0"
@@ -844,7 +916,7 @@ version = "3.18.0"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
groups = ["main", "dev"]
files = [
{file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"},
{file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"},
@@ -1082,7 +1154,7 @@ version = "2.1.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
groups = ["main", "dev"]
files = [
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
@@ -1355,6 +1427,22 @@ files = [
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
]
[[package]]
name = "mouseinfo"
version = "0.1.3"
description = "An application to display XY position and RGB color information for the pixel currently under the mouse. Works on Python 2 and 3."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "MouseInfo-0.1.3.tar.gz", hash = "sha256:2c62fb8885062b8e520a3cce0a297c657adcc08c60952eb05bc8256ef6f7f6e7"},
]
[package.dependencies]
pyperclip = "*"
python3-Xlib = {version = "*", markers = "platform_system == \"Linux\" and python_version >= \"3.0\""}
rubicon-objc = {version = "*", markers = "platform_system == \"Darwin\""}
[[package]]
name = "mutagen"
version = "1.47.0"
@@ -1367,6 +1455,18 @@ files = [
{file = "mutagen-1.47.0.tar.gz", hash = "sha256:719fadef0a978c31b4cf3c956261b3c58b6948b32023078a2117b1de09f0fc99"},
]
[[package]]
name = "mycdp"
version = "1.2.0"
description = "Autogenerated CDP utilities for Python"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "mycdp-1.2.0-py3-none-any.whl", hash = "sha256:8f9ef628fa68e391f59ad9cd555ae75746bd3a48947017c9ecc65a63624a1d41"},
{file = "mycdp-1.2.0.tar.gz", hash = "sha256:0603fd8e3454147c4f549edaa13f5294f57ecb481640c03f808ed548a03f796f"},
]
[[package]]
name = "mypy-extensions"
version = "1.1.0"
@@ -1562,6 +1662,71 @@ files = [
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
]
[[package]]
name = "parameterized"
version = "0.9.0"
description = "Parameterized testing with any Python test framework"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "parameterized-0.9.0-py2.py3-none-any.whl", hash = "sha256:4e0758e3d41bea3bbd05ec14fc2c24736723f243b28d702081aef438c9372b1b"},
{file = "parameterized-0.9.0.tar.gz", hash = "sha256:7fc905272cefa4f364c1a3429cbbe9c0f98b793988efb5bf90aac80f08db09b1"},
]
[package.extras]
dev = ["jinja2"]
[[package]]
name = "parse"
version = "1.20.2"
description = "parse() is the opposite of format()"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "parse-1.20.2-py2.py3-none-any.whl", hash = "sha256:967095588cb802add9177d0c0b6133b5ba33b1ea9007ca800e526f42a85af558"},
{file = "parse-1.20.2.tar.gz", hash = "sha256:b41d604d16503c79d81af5165155c0b20f6c8d6c559efa66b4b695c3e5a0a0ce"},
]
[[package]]
name = "parse-type"
version = "0.6.4"
description = "Simplifies to build parse types based on the parse module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,>=2.7"
groups = ["main"]
files = [
{file = "parse_type-0.6.4-py2.py3-none-any.whl", hash = "sha256:83d41144a82d6b8541127bf212dd76c7f01baff680b498ce8a4d052a7a5bce4c"},
{file = "parse_type-0.6.4.tar.gz", hash = "sha256:5e1ec10440b000c3f818006033372939e693a9ec0176f446d9303e4db88489a6"},
]
[package.dependencies]
parse = {version = ">=1.18.0", markers = "python_version >= \"3.0\""}
six = ">=1.15"
[package.extras]
develop = ["build (>=0.5.1)", "coverage (>=4.4)", "pylint", "pytest (<5.0) ; python_version < \"3.0\"", "pytest (>=5.0) ; python_version >= \"3.0\"", "pytest-cov", "pytest-html (>=1.19.0)", "ruff ; python_version >= \"3.7\"", "setuptools", "setuptools-scm", "tox (>=2.8,<4.0)", "twine (>=1.13.0)", "virtualenv (<20.22.0) ; python_version <= \"3.6\"", "virtualenv (>=20.0.0) ; python_version > \"3.6\"", "wheel"]
docs = ["Sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6.0)"]
testing = ["pytest (<5.0) ; python_version < \"3.0\"", "pytest (>=5.0) ; python_version >= \"3.0\"", "pytest-html (>=1.19.0)"]
[[package]]
name = "pdbp"
version = "1.7.0"
description = "pdbp (Pdb+): A drop-in replacement for pdb and pdbpp."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "pdbp-1.7.0-py3-none-any.whl", hash = "sha256:6ad99cb4e9f2fc1a5b4ef4f2e0acdb28b18b271bf71f6c9f997b652d935caa19"},
{file = "pdbp-1.7.0.tar.gz", hash = "sha256:d0a5b275720c451f5574427e35523aeb61c244f3faf622a80fe03019ef82d380"},
]
[package.dependencies]
colorama = {version = ">=0.4.6", markers = "platform_system == \"Windows\""}
pygments = ">=2.19.1"
tabcompleter = ">=1.4.0"
[[package]]
name = "pdqhash"
version = "0.2.8"
@@ -1681,13 +1846,25 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
typing = ["typing-extensions ; python_version < \"3.10\""]
xmp = ["defusedxml"]
[[package]]
name = "pip"
version = "25.1.1"
description = "The PyPA recommended tool for installing Python packages."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "pip-25.1.1-py3-none-any.whl", hash = "sha256:2913a38a2abf4ea6b64ab507bd9e967f3b53dc1ede74b01b0931e1ce548751af"},
{file = "pip-25.1.1.tar.gz", hash = "sha256:3de45d411d308d5054c2168185d8da7f9a2cd753dbac8acbfa88a8909ecd9077"},
]
[[package]]
name = "platformdirs"
version = "4.3.8"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
groups = ["main", "dev"]
files = [
{file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"},
{file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"},
@@ -1704,7 +1881,7 @@ version = "1.6.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.9"
groups = ["dev"]
groups = ["main", "dev"]
files = [
{file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"},
{file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"},
@@ -1808,6 +1985,27 @@ files = [
[package.dependencies]
pyasn1 = ">=0.6.1,<0.7.0"
[[package]]
name = "pyautogui"
version = "0.9.54"
description = "PyAutoGUI lets Python control the mouse and keyboard, and other GUI automation tasks. For Windows, macOS, and Linux, on Python 3 and 2."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "PyAutoGUI-0.9.54.tar.gz", hash = "sha256:dd1d29e8fd118941cb193f74df57e5c6ff8e9253b99c7b04f39cfc69f3ae04b2"},
]
[package.dependencies]
mouseinfo = "*"
pygetwindow = ">=0.0.5"
pymsgbox = "*"
pyobjc-core = {version = "*", markers = "platform_system == \"Darwin\""}
pyobjc-framework-quartz = {version = "*", markers = "platform_system == \"Darwin\""}
pyscreeze = ">=0.1.21"
python3-Xlib = {version = "*", markers = "platform_system == \"Linux\" and python_version >= \"3.0\""}
pytweening = ">=1.0.4"
[[package]]
name = "pycodestyle"
version = "2.13.0"
@@ -1912,6 +2110,20 @@ doc = ["ablog (>=0.11.8)", "colorama", "graphviz", "ipykernel", "ipyleaflet", "i
i18n = ["Babel", "jinja2"]
test = ["pytest", "pytest-cov", "pytest-regressions", "sphinx[test]"]
[[package]]
name = "pygetwindow"
version = "0.0.9"
description = "A simple, cross-platform module for obtaining GUI information on application's windows."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "PyGetWindow-0.0.9.tar.gz", hash = "sha256:17894355e7d2b305cd832d717708384017c1698a90ce24f6f7fbf0242dd0a688"},
]
[package.dependencies]
pyrect = "*"
[[package]]
name = "pygments"
version = "2.19.1"
@@ -1927,6 +2139,105 @@ files = [
[package.extras]
windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pymsgbox"
version = "1.0.9"
description = "A simple, cross-platform, pure Python module for JavaScript-like message boxes."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "PyMsgBox-1.0.9.tar.gz", hash = "sha256:2194227de8bff7a3d6da541848705a155dcbb2a06ee120d9f280a1d7f51263ff"},
]
[[package]]
name = "pynose"
version = "1.5.4"
description = "pynose fixes nose to extend unittest and make testing easier"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "pynose-1.5.4-py3-none-any.whl", hash = "sha256:ee4ae91c9e2b54d46647f5d81b95392dd27e02ed26f016dadb5f1ac10f949d96"},
{file = "pynose-1.5.4.tar.gz", hash = "sha256:97dd0b7e85cf990120a01147e83ccd960c09ffcd69f6822f18e14128c6655e67"},
]
[[package]]
name = "pyobjc-core"
version = "11.0"
description = "Python<->ObjC Interoperability Module"
optional = false
python-versions = ">=3.8"
groups = ["main"]
markers = "platform_system == \"Darwin\""
files = [
{file = "pyobjc_core-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:10866b3a734d47caf48e456eea0d4815c2c9b21856157db5917b61dee06893a1"},
{file = "pyobjc_core-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:50675c0bb8696fe960a28466f9baf6943df2928a1fd85625d678fa2f428bd0bd"},
{file = "pyobjc_core-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a03061d4955c62ddd7754224a80cdadfdf17b6b5f60df1d9169a3b1b02923f0b"},
{file = "pyobjc_core-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c338c1deb7ab2e9436d4175d1127da2eeed4a1b564b3d83b9f3ae4844ba97e86"},
{file = "pyobjc_core-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b4e9dc4296110f251a4033ff3f40320b35873ea7f876bd29a1c9705bb5e08c59"},
{file = "pyobjc_core-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:02406ece449d0f41b31e579e47ca77ced3eb57533df955281bfcecc99da74fba"},
{file = "pyobjc_core-11.0.tar.gz", hash = "sha256:63bced211cb8a8fb5c8ff46473603da30e51112861bd02c438fbbbc8578d9a70"},
]
[[package]]
name = "pyobjc-framework-cocoa"
version = "11.0"
description = "Wrappers for the Cocoa frameworks on macOS"
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_system == \"Darwin\""
files = [
{file = "pyobjc_framework_Cocoa-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fbc65f260d617d5463c7fb9dbaaffc23c9a4fabfe3b1a50b039b61870b8daefd"},
{file = "pyobjc_framework_Cocoa-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3ea7be6e6dd801b297440de02d312ba3fa7fd3c322db747ae1cb237e975f5d33"},
{file = "pyobjc_framework_Cocoa-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:280a577b83c68175a28b2b7138d1d2d3111f2b2b66c30e86f81a19c2b02eae71"},
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:15b2bd977ed340074f930f1330f03d42912d5882b697d78bd06f8ebe263ef92e"},
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5750001db544e67f2b66f02067d8f0da96bb2ef71732bde104f01b8628f9d7ea"},
{file = "pyobjc_framework_Cocoa-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ddff25b0755d59873d186e1e07d6aaddb19d55e3ae890d69ff2d9babf8627657"},
{file = "pyobjc_framework_cocoa-11.0.tar.gz", hash = "sha256:00346a8cb81ad7b017b32ff7bf596000f9faa905807b1bd234644ebd47f692c5"},
]
[package.dependencies]
pyobjc-core = ">=11.0"
[[package]]
name = "pyobjc-framework-quartz"
version = "11.0"
description = "Wrappers for the Quartz frameworks on macOS"
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_system == \"Darwin\""
files = [
{file = "pyobjc_framework_Quartz-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:da3ab13c9f92361959b41b0ad4cdd41ae872f90a6d8c58a9ed699bc08ab1c45c"},
{file = "pyobjc_framework_Quartz-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d251696bfd8e8ef72fbc90eb29fec95cb9d1cc409008a183d5cc3246130ae8c2"},
{file = "pyobjc_framework_Quartz-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cb4a9f2d9d580ea15e25e6b270f47681afb5689cafc9e25712445ce715bcd18e"},
{file = "pyobjc_framework_Quartz-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:973b4f9b8ab844574461a038bd5269f425a7368d6e677e3cc81fcc9b27b65498"},
{file = "pyobjc_framework_Quartz-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:66ab58d65348863b8707e63b2ec5cdc54569ee8189d1af90d52f29f5fdf6272c"},
{file = "pyobjc_framework_Quartz-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1032f63f2a4ee98366764e69c249f1d93813821e17d224cf626cf11fb1801fc4"},
{file = "pyobjc_framework_quartz-11.0.tar.gz", hash = "sha256:3205bf7795fb9ae34747f701486b3db6dfac71924894d1f372977c4d70c3c619"},
]
[package.dependencies]
pyobjc-core = ">=11.0"
pyobjc-framework-Cocoa = ">=11.0"
[[package]]
name = "pyotp"
version = "2.9.0"
description = "Python One Time Password Library"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "pyotp-2.9.0-py3-none-any.whl", hash = "sha256:81c2e5865b8ac55e825b0358e496e1d9387c811e85bb40e71a3b29b288963612"},
{file = "pyotp-2.9.0.tar.gz", hash = "sha256:346b6642e0dbdde3b4ff5a930b664ca82abfa116356ed48cc42c7d6590d36f63"},
]
[package.extras]
test = ["coverage", "mypy", "ruff", "wheel"]
[[package]]
name = "pyparsing"
version = "3.2.3"
@@ -1942,6 +2253,61 @@ files = [
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "pyperclip"
version = "1.9.0"
description = "A cross-platform clipboard module for Python. (Only handles plain text for now.)"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "pyperclip-1.9.0.tar.gz", hash = "sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310"},
]
[[package]]
name = "pyreadline3"
version = "3.5.4"
description = "A python implementation of GNU readline."
optional = false
python-versions = ">=3.8"
groups = ["main"]
markers = "platform_system == \"Windows\""
files = [
{file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
{file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
]
[package.extras]
dev = ["build", "flake8", "mypy", "pytest", "twine"]
[[package]]
name = "pyrect"
version = "0.2.0"
description = "PyRect is a simple module with a Rect class for Pygame-like rectangular areas."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "PyRect-0.2.0.tar.gz", hash = "sha256:f65155f6df9b929b67caffbd57c0947c5ae5449d3b580d178074bffb47a09b78"},
]
[[package]]
name = "pyscreeze"
version = "1.0.1"
description = "A simple, cross-platform screenshot module for Python 2 and 3."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "pyscreeze-1.0.1.tar.gz", hash = "sha256:cf1662710f1b46aa5ff229ee23f367da9e20af4a78e6e365bee973cad0ead4be"},
]
[package.dependencies]
Pillow = [
{version = ">=9.3.0", markers = "python_version == \"3.11\""},
{version = ">=9.2.0", markers = "python_version == \"3.10\""},
]
[[package]]
name = "pysocks"
version = "1.7.1"
@@ -1973,7 +2339,7 @@ version = "8.3.5"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
groups = ["main", "dev"]
files = [
{file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
{file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
@@ -1990,6 +2356,27 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-html"
version = "4.0.2"
description = "pytest plugin for generating HTML reports"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "pytest_html-4.0.2-py3-none-any.whl", hash = "sha256:907c3e68462df129d3ee96dee58bd63f70216b06421836b22fd3fd57ef314acb"},
{file = "pytest_html-4.0.2.tar.gz", hash = "sha256:88682b9e8e51392472546a70a2139b27d6bc1834a4afd3e41da33c9d9f91e4a4"},
]
[package.dependencies]
jinja2 = ">=3.0.0"
pytest = ">=7.0.0"
pytest-metadata = ">=2.0.0"
[package.extras]
docs = ["pip-tools (>=6.13.0)"]
test = ["assertpy (>=1.1)", "beautifulsoup4 (>=4.11.1)", "black (>=22.1.0)", "flake8 (>=4.0.1)", "pre-commit (>=2.17.0)", "pytest-mock (>=3.7.0)", "pytest-rerunfailures (>=11.1.2)", "pytest-xdist (>=2.4.0)", "selenium (>=4.3.0)", "tox (>=3.24.5)"]
[[package]]
name = "pytest-loguru"
version = "0.4.0"
@@ -2008,6 +2395,24 @@ loguru = "*"
[package.extras]
test = ["pytest", "pytest-cov"]
[[package]]
name = "pytest-metadata"
version = "3.1.1"
description = "pytest plugin for test session metadata"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "pytest_metadata-3.1.1-py3-none-any.whl", hash = "sha256:c8e0844db684ee1c798cfa38908d20d67d0463ecb6137c72e91f418558dd5f4b"},
{file = "pytest_metadata-3.1.1.tar.gz", hash = "sha256:d2a29b0355fbc03f168aa96d41ff88b1a3b44a3b02acbe491801c98a048017c8"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
test = ["black (>=22.1.0)", "flake8 (>=4.0.1)", "pre-commit (>=2.17.0)", "tox (>=3.24.5)"]
[[package]]
name = "pytest-mock"
version = "3.14.1"
@@ -2026,6 +2431,59 @@ pytest = ">=6.2.5"
[package.extras]
dev = ["pre-commit", "pytest-asyncio", "tox"]
[[package]]
name = "pytest-ordering"
version = "0.6"
description = "pytest plugin to run your tests in a specific order"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
{file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
{file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"},
]
[package.dependencies]
pytest = "*"
[[package]]
name = "pytest-rerunfailures"
version = "15.1"
description = "pytest plugin to re-run tests to eliminate flaky failures"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "pytest_rerunfailures-15.1-py3-none-any.whl", hash = "sha256:f674c3594845aba8b23c78e99b1ff8068556cc6a8b277f728071fdc4f4b0b355"},
{file = "pytest_rerunfailures-15.1.tar.gz", hash = "sha256:c6040368abd7b8138c5b67288be17d6e5611b7368755ce0465dda0362c8ece80"},
]
[package.dependencies]
packaging = ">=17.1"
pytest = ">=7.4,<8.2.2 || >8.2.2"
[[package]]
name = "pytest-xdist"
version = "3.7.0"
description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "pytest_xdist-3.7.0-py3-none-any.whl", hash = "sha256:7d3fbd255998265052435eb9daa4e99b62e6fb9cfb6efd1f858d4d8c0c7f0ca0"},
{file = "pytest_xdist-3.7.0.tar.gz", hash = "sha256:f9248c99a7c15b7d2f90715df93610353a485827bc06eefb6566d23f6400f126"},
]
[package.dependencies]
execnet = ">=2.1"
pytest = ">=7.0.0"
[package.extras]
psutil = ["psutil (>=3.0)"]
setproctitle = ["setproctitle"]
testing = ["filelock"]
[[package]]
name = "python-bitcoinlib"
version = "0.12.2"
@@ -2088,6 +2546,45 @@ Authlib = ">=1.0.0"
dataclasses-json = ">=0.5.7"
requests = ">=2.28"
[[package]]
name = "python-xlib"
version = "0.33"
description = "Python X Library"
optional = false
python-versions = "*"
groups = ["main"]
markers = "platform_system == \"Linux\""
files = [
{file = "python-xlib-0.33.tar.gz", hash = "sha256:55af7906a2c75ce6cb280a584776080602444f75815a7aff4d287bb2d7018b32"},
{file = "python_xlib-0.33-py2.py3-none-any.whl", hash = "sha256:c3534038d42e0df2f1392a1b30a15a4ff5fdc2b86cfa94f072bf11b10a164398"},
]
[package.dependencies]
six = ">=1.10.0"
[[package]]
name = "python3-xlib"
version = "0.15"
description = "Python3 X Library"
optional = false
python-versions = "*"
groups = ["main"]
markers = "platform_system == \"Linux\""
files = [
{file = "python3-xlib-0.15.tar.gz", hash = "sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8"},
]
[[package]]
name = "pytweening"
version = "1.2.0"
description = "A collection of tweening (aka easing) functions."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "pytweening-1.2.0.tar.gz", hash = "sha256:243318b7736698066c5f362ec5c2b6434ecf4297c3c8e7caa8abfe6af4cac71b"},
]
[[package]]
name = "pytz"
version = "2025.2"
@@ -2106,7 +2603,7 @@ version = "6.0.2"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
groups = ["dev", "docs"]
groups = ["main", "dev", "docs"]
files = [
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@@ -2397,7 +2894,7 @@ description = "Manipulate well-formed Roman numerals"
optional = false
python-versions = ">=3.9"
groups = ["docs"]
markers = "python_version == \"3.12\""
markers = "python_version >= \"3.11\""
files = [
{file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"},
{file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"},
@@ -2498,6 +2995,23 @@ files = [
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
]
[[package]]
name = "rubicon-objc"
version = "0.5.1"
description = "A bridge between an Objective C runtime environment and Python."
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_system == \"Darwin\""
files = [
{file = "rubicon_objc-0.5.1-py3-none-any.whl", hash = "sha256:17092756241b8370231cfaad45ad6e8ce99534987f2acbc944d65df5bdf8f6cd"},
{file = "rubicon_objc-0.5.1.tar.gz", hash = "sha256:90bee9fc1de4515e17615e15648989b88bb8d4d2ffc8c7c52748272cd7f30a66"},
]
[package.extras]
dev = ["pre-commit (==4.2.0)", "pytest (==8.3.5)", "setuptools_scm (==8.3.1)", "tox (==4.26.0)"]
docs = ["furo (==2024.8.6)", "pyenchant (==3.2.2)", "sphinx (==8.2.3)", "sphinx-autobuild (==2024.10.3)", "sphinx-copybutton (==0.5.2)", "sphinx_tabs (==3.4.7)", "sphinxcontrib-spelling (==8.0.1)"]
[[package]]
name = "ruff"
version = "0.9.10"
@@ -2544,6 +3058,22 @@ botocore = ">=1.37.4,<2.0a.0"
[package.extras]
crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"]
[[package]]
name = "sbvirtualdisplay"
version = "1.4.0"
description = "A customized pyvirtualdisplay for SeleniumBase."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "sbvirtualdisplay-1.4.0-py3-none-any.whl", hash = "sha256:516de155219aa342c4e090a3c5126cfe6b12416334bcba3255268e44a5e8a206"},
{file = "sbvirtualdisplay-1.4.0.tar.gz", hash = "sha256:29a365b509cd7bfde4f758603b7b75703909b11cdf4245abc8f828ed35660d9b"},
]
[package.extras]
coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.6.9) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.0.0) ; python_version >= \"3.9\""]
flake8 = ["flake8 (==5.0.4) ; python_version < \"3.9\"", "flake8 (==7.1.1) ; python_version >= \"3.9\"", "mccabe (==0.7.0)", "pycodestyle (==2.12.1) ; python_version >= \"3.9\"", "pycodestyle (==2.9.1) ; python_version < \"3.9\"", "pyflakes (==2.5.0) ; python_version < \"3.9\"", "pyflakes (==3.2.0) ; python_version >= \"3.9\""]
[[package]]
name = "secretstorage"
version = "3.3.3"
@@ -2580,6 +3110,115 @@ typing_extensions = ">=4.13.2,<4.14.0"
urllib3 = {version = ">=2.4.0,<2.5.0", extras = ["socks"]}
websocket-client = ">=1.8.0,<1.9.0"
[[package]]
name = "seleniumbase"
version = "4.39.2"
description = "A complete web automation framework for end-to-end testing."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"},
{file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"},
]
[package.dependencies]
attrs = ">=25.3.0"
beautifulsoup4 = "4.13.4"
behave = "1.2.6"
certifi = ">=2025.4.26"
chardet = "5.2.0"
charset-normalizer = ">=3.4.2,<4"
colorama = ">=0.4.6"
cssselect = {version = "1.3.0", markers = "python_version >= \"3.9\""}
exceptiongroup = ">=1.3.0"
execnet = "2.1.1"
fasteners = ">=0.19"
filelock = {version = ">=3.18.0", markers = "python_version >= \"3.9\""}
h11 = "0.16.0"
idna = "3.10"
iniconfig = "2.1.0"
Jinja2 = ">=3.1.6"
markdown-it-py = "3.0.0"
MarkupSafe = {version = ">=3.0.2", markers = "python_version >= \"3.9\""}
mdurl = "0.1.2"
mycdp = ">=1.2.0"
outcome = "1.3.0.post0"
packaging = ">=25.0"
parameterized = "0.9.0"
parse = ">=1.20.2"
parse-type = ">=0.6.4"
pdbp = ">=1.7.0"
pip = {version = ">=25.1.1", markers = "python_version >= \"3.9\""}
platformdirs = {version = ">=4.3.8", markers = "python_version >= \"3.9\""}
pluggy = {version = "1.6.0", markers = "python_version >= \"3.9\""}
pygments = ">=2.19.1"
pynose = ">=1.5.4"
pyotp = "2.9.0"
pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""}
pytest = "8.3.5"
pytest-html = "4.0.2"
pytest-metadata = "3.1.1"
pytest-ordering = "0.6"
pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""}
pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""}
python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""}
pyyaml = ">=6.0.2"
requests = "2.32.3"
rich = ">=14.0.0,<15"
sbvirtualdisplay = ">=1.4.0"
selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""}
setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""}
six = ">=1.17.0"
sniffio = "1.3.1"
sortedcontainers = "2.4.0"
soupsieve = "2.7"
tabcompleter = ">=1.4.0"
trio = {version = "0.30.0", markers = "python_version >= \"3.9\""}
trio-websocket = "0.12.2"
typing-extensions = ">=4.13.2"
urllib3 = {version = ">=1.26.20,<2.5.0", markers = "python_version >= \"3.10\""}
websocket-client = "1.8.0"
websockets = {version = ">=15.0.1", markers = "python_version >= \"3.9\""}
wheel = ">=0.45.1"
wsproto = "1.2.0"
[package.extras]
allure = ["allure-behave (>=2.13.5)", "allure-pytest (>=2.13.5)", "allure-python-commons (>=2.13.5)"]
coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.8.2) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.1.1) ; python_version >= \"3.9\""]
flake8 = ["flake8 (==5.0.4) ; python_version < \"3.9\"", "flake8 (==7.2.0) ; python_version >= \"3.9\"", "mccabe (==0.7.0)", "pycodestyle (==2.13.0) ; python_version >= \"3.9\"", "pycodestyle (==2.9.1) ; python_version < \"3.9\"", "pyflakes (==2.5.0) ; python_version < \"3.9\"", "pyflakes (==3.3.2) ; python_version >= \"3.9\""]
ipdb = ["ipdb (==0.13.13)", "ipython (==7.34.0)"]
mss = ["mss (==10.0.0) ; python_version >= \"3.9\"", "mss (==9.0.2) ; python_version < \"3.9\""]
pdfminer = ["cffi (==1.17.1)", "cryptography (==39.0.2) ; python_version < \"3.9\"", "cryptography (==45.0.3) ; python_version >= \"3.9\"", "pdfminer.six (==20250324) ; python_version < \"3.9\"", "pdfminer.six (==20250506) ; python_version >= \"3.9\"", "pycparser (==2.22)"]
pillow = ["Pillow (>=10.4.0) ; python_version < \"3.9\"", "Pillow (>=11.2.1) ; python_version >= \"3.9\""]
pip-system-certs = ["pip-system-certs (==4.0) ; platform_system == \"Windows\""]
proxy = ["proxy.py (==2.4.3)"]
psutil = ["psutil (==7.0.0)"]
pyautogui = ["PyAutoGUI (==0.9.54)"]
selenium-stealth = ["selenium-stealth (==1.0.6)"]
selenium-wire = ["Brotli (==1.1.0)", "blinker (==1.7.0)", "h2 (==4.1.0)", "hpack (==4.0.0)", "hyperframe (==6.0.1)", "kaitaistruct (==0.10)", "pyOpenSSL (==24.2.1)", "pyasn1 (==0.6.1)", "pyparsing (>=3.1.4)", "selenium-wire (==5.1.0)", "zstandard (==0.23.0)"]
[[package]]
name = "setuptools"
version = "80.9.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
]
[package.extras]
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
cover = ["pytest-cov"]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
enabler = ["pytest-enabler (>=2.2)"]
test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
[[package]]
name = "six"
version = "1.17.0"
@@ -2647,7 +3286,7 @@ description = "Python documentation generator"
optional = false
python-versions = ">=3.10"
groups = ["docs"]
markers = "python_version < \"3.12\""
markers = "python_version == \"3.10\""
files = [
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
@@ -2684,7 +3323,7 @@ description = "Python documentation generator"
optional = false
python-versions = ">=3.11"
groups = ["docs"]
markers = "python_version == \"3.12\""
markers = "python_version >= \"3.11\""
files = [
{file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"},
{file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"},
@@ -2935,6 +3574,21 @@ anyio = ">=3.6.2,<5"
[package.extras]
full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"]
[[package]]
name = "tabcompleter"
version = "1.4.0"
description = "tabcompleter --- Autocompletion in the Python console."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "tabcompleter-1.4.0-py3-none-any.whl", hash = "sha256:d744aa735b49c0a6cc2fb8fcd40077fec47425e4388301010b14e6ce3311368b"},
{file = "tabcompleter-1.4.0.tar.gz", hash = "sha256:7562a9938e62f8e7c3be612c3ac4e14c5ec4307b58ba9031c148260e866e8814"},
]
[package.dependencies]
pyreadline3 = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "telethon"
version = "1.40.0"
@@ -2972,7 +3626,7 @@ version = "2.2.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
groups = ["dev", "docs"]
groups = ["main", "dev", "docs"]
markers = "python_version == \"3.10\""
files = [
{file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"},
@@ -3418,6 +4072,21 @@ files = [
{file = "websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee"},
]
[[package]]
name = "wheel"
version = "0.45.1"
description = "A built-package format for Python"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248"},
{file = "wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729"},
]
[package.extras]
test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
[[package]]
name = "win32-setctime"
version = "1.2.0"
@@ -3485,4 +4154,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "03a5cc0c06de5cc1227dc36895013562697a481f1e41aa405cc33545c29bbef3"
content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350"

View File

@@ -27,7 +27,6 @@ dependencies = [
"bs4 (>=0.0.0)",
"loguru (>=0.0.0)",
"ffmpeg-python (>=0.0.0)",
"selenium (>=0.0.0)",
"telethon (>=0.0.0)",
"google-api-python-client (>=0.0.0)",
"google-auth-httplib2 (>=0.0.0)",
@@ -57,6 +56,8 @@ dependencies = [
"bgutil-ytdlp-pot-provider (>=1.0.0)",
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
"secretstorage (>=3.3.3,<4.0.0)",
"seleniumbase (>=4.36.4,<5.0.0)",
"pyautogui (>=0.9.54,<0.10.0)",
]
[tool.poetry.group.dev.dependencies]

View File

@@ -0,0 +1,44 @@
{
"name": "Antibot Extractor/Enricher",
"type": ["extractor", "enricher"],
"requires_setup": False,
"dependencies": {
"python": ["loguru", "seleniumbase"],
},
"configs": {
"save_to_pdf": {
"default": False,
"type": "bool",
"help": "save a PDF snapshot of the page.",
},
"max_download_images": {
"default": 50,
"help": "maximum number of images to download from the page (0 = no download, inf = no limit).",
},
"max_download_videos": {
"default": 50,
"help": "maximum number of videos to download from the page (0 = no download, inf = no limit).",
},
"exclude_media_extensions": {
"default": ".svg,.ico,.gif",
"help": "CSV of media (image/video) file extensions to exclude from download",
},
"proxy": {
"default": None,
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
},
},
"description": """
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile.
### Features
- Extracts the HTML source code of the page.
- Takes full-page screenshots of web pages.
- Takes full-page PDF snapshots of web pages.
- Downloads images and videos from the page, excluding specified file extensions.
### Notes
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
""",
}

View File

@@ -0,0 +1,217 @@
import base64
import math
import mimetypes
import os
import sys
import traceback
from urllib.parse import urljoin
from loguru import logger
from seleniumbase import SB
from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.utils.misc import random_str
class AntibotExtractorEnricher(Extractor, Enricher):
def setup(self) -> None:
self.agent = "cool"
if "linux" in sys.platform or "win32" in sys.platform:
self.agent = None # Use the default UserAgent
# parse configuration options
self.exclude_media_mimetypes = set(
[mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")]
) - {None}
if self.max_download_images == "inf":
self.max_download_images = math.inf
else:
self.max_download_images = int(self.max_download_images)
if self.max_download_videos == "inf":
self.max_download_videos = math.inf
else:
self.max_download_videos = int(self.max_download_videos)
def download(self, item: Metadata) -> Metadata:
    """
    Extractor entry point: archive `item` by running the enrichment flow on a copy.

    A fresh Metadata is created, `item` is merged into it, and `enrich` is run
    on that copy. The copy is always returned; its status is set to "antibot"
    only when `enrich` reports a truthy result.
    """
    archived = Metadata()
    archived.merge(item)

    succeeded = self.enrich(archived)
    if succeeded:
        archived.status = "antibot"

    return archived
def enrich(self, to_enrich: Metadata) -> bool:
    """
    Open the item's URL in a SeleniumBase undetected-Chrome session and attach
    the captured artefacts (title, HTML source, screenshot, optional PDF,
    images and videos) to `to_enrich`.

    Returns:
        True when the page was captured; False when an auth wall/CAPTCHA was
        detected or any exception occurred (the exception is logged, not raised).
    """
    url = to_enrich.get_url()
    # TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher
    # shortened URL used in log messages only
    url_sample = url[:75]

    try:
        with SB(uc=True, agent=self.agent, headed=None, proxy=self.proxy) as sb:
            logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
            sb.uc_open_with_reconnect(url, 4)

            logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
            # TODO: implement other Captcha handling
            sb.uc_gui_handle_captcha()  # handles Cloudflare Turnstile captcha if detected
            # time.sleep(1) # wait for the page to load

            if self._hit_auth_wall(sb):
                logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
                return False

            logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
            to_enrich.set_title(sb.get_title())
            self._enrich_html_source_code(sb, to_enrich)
            self._enrich_full_page_screenshot(sb, to_enrich)
            if self.save_to_pdf:
                self._enrich_full_page_pdf(sb, to_enrich)
            self._enrich_download_media(sb, to_enrich, css_selector="img", max_media=self.max_download_images)
            self._enrich_download_media(
                sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos
            )
            logger.success(f"ANTIBOT completed for {url_sample}")
            # The signature promises a bool; previously the Metadata object itself was returned.
            return True
    except Exception as e:
        # Top-level boundary for browser failures: log with traceback and report failure.
        logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
        return False
def _hit_auth_wall(self, sb: SB) -> bool:
    """
    Heuristically decide whether the page loaded in `sb` is a login/CAPTCHA wall.

    Checks, in order, the current URL, visible page text, the page title and
    visible credential form fields, returning True at the first match and
    False when nothing matches.
    """
    # TODO: improve this detection logic, currently it is very basic and may not cover all cases

    # 1. URL fragments that hint at a redirect to an auth page
    current_url = sb.get_current_url().lower()
    for marker in ("login", "signin", "signup", "register", "captcha"):
        if marker in current_url:
            return True

    # 2. Phrases visible on the page, tried in lower-case and capitalised form
    phrases = [
        "sign up or log in",
        "log in to continue",
        "sign in to continue",
        "login required",
        "please log in",
        "please sign up",
        "please sign in",
        "login to access",
        "sign up to access",
        "register to access",
        "captcha verification",
    ]
    candidates = phrases + [phrase.capitalize() for phrase in phrases]
    if any(sb.is_text_visible(candidate) for candidate in candidates):
        return True

    # 3. Page titles typical of interstitials and login pages
    page_title = sb.get_title().lower()
    title_markers = (
        "just a moment...",
        "tiktok - make your day",
        "um momento...",
        "log in",
        "sign in",
        "sign up",
        "register",
        "captcha",
        "verification required",
        "access denied",
    )
    for marker in title_markers:
        if marker in page_title:
            return True

    # 4. Visible credential inputs
    selectors = (
        "input[type='password']",
        "input[type='email']",
        "input[type='username']",
        "input[type='phone']",
        "input[name='username']",
        "input[name='email']",
        "input[name='password']",
        "input[name='login']",
    )
    return any(sb.is_element_visible(selector) for selector in selectors)
@logger.catch
def _enrich_html_source_code(self, sb: SB, to_enrich: Metadata):
    """
    Save the current page source to an HTML file in `self.tmp_dir` and attach
    it to `to_enrich` as the media with id "html_source_code".
    """
    page_html = sb.get_page_source()
    target_path = os.path.join(self.tmp_dir, f"source{random_str(6)}.html")

    with open(target_path, "w", encoding="utf-8") as handle:
        handle.write(page_html)

    html_media = Media(filename=target_path)
    to_enrich.add_media(html_media, id="html_source_code")
@logger.catch
def _enrich_full_page_screenshot(self, sb: SB, to_enrich: Metadata):
    """
    Resize the browser window to the document's scroll size (height capped at
    25,000 px), save a PNG screenshot into `self.tmp_dir` and attach it to
    `to_enrich` as the media with id "screenshot".
    """
    page_width = sb.execute_script("return document.documentElement.scrollWidth")
    full_height = sb.execute_script("return document.documentElement.scrollHeight")
    page_height = min(full_height, 25_000)
    sb.set_window_size(page_width, page_height)

    target_path = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
    sb.save_screenshot(target_path)

    to_enrich.add_media(Media(filename=target_path), id="screenshot")
@logger.catch
def _enrich_full_page_pdf(self, sb: SB, to_enrich: Metadata):
    """
    Print the current page to PDF through the CDP `Page.printToPDF` command,
    write the decoded bytes into `self.tmp_dir` and attach the file to
    `to_enrich` as the media with id "pdf".
    """
    print_params = {"printBackground": True, "landscape": False}
    cdp_reply = sb.driver.execute_cdp_cmd("Page.printToPDF", print_params)

    # CDP returns the document base64-encoded under "data"
    pdf_bytes = base64.b64decode(cdp_reply["data"])

    target_path = os.path.join(self.tmp_dir, f"pdf{random_str(6)}.pdf")
    with open(target_path, "wb") as handle:
        handle.write(pdf_bytes)

    to_enrich.add_media(Media(filename=target_path), id="pdf")
@logger.catch
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
    """
    Download the media referenced by the `src` attribute of elements matching
    `css_selector` and attach each downloaded file to `to_enrich`.

    Stops once `max_media` distinct URLs have been downloaded (0 disables the
    step entirely). Sources whose guessed mimetype is in
    `self.exclude_media_mimetypes` are skipped, as are URLs already handled
    and URLs for which `download_from_url` returns nothing.
    """
    if max_media == 0:
        return

    logger.debug(
        f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items."
    )
    page_url = to_enrich.get_url()
    downloaded = set()

    for element in sb.find_elements(css_selector):
        if len(downloaded) >= max_media:
            logger.debug(f"Reached max download limit of {max_media} images/videos.")
            break

        src = element.get_attribute("src")
        if not src:
            continue

        guessed_type = mimetypes.guess_type(src)[0]
        if guessed_type in self.exclude_media_mimetypes:
            continue

        # resolve relative sources against the page URL
        absolute_src = urljoin(page_url, src)
        if absolute_src in downloaded:
            continue

        filename = self.download_from_url(absolute_src)
        if not filename:
            continue

        downloaded.add(absolute_src)
        to_enrich.add_media(Media(filename=filename, properties={"url": absolute_src}))

View File

@@ -62,7 +62,7 @@ If you are having issues with the extractor, you can review the version of `yt-d
},
"end_means_success": {
"default": True,
"help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
"help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",
"type": "bool",
},
"allow_playlist": {

View File

@@ -33,6 +33,9 @@ class GenericExtractor(Extractor):
def setup(self):
    """Run start-up tasks: the extractor update check first, then `setup_po_tokens()`."""
    self.check_for_extractor_updates()
    self.setup_po_tokens()
    # TODO: figure out why the following is not properly recognised by yt-dlp:
    # if "generic" not in self.extractor_args:
    #     self.extractor_args["generic"] = "impersonate"
def check_for_extractor_updates(self):
"""Checks whether yt-dlp or its plugins need updating and triggers a restart if so."""
@@ -590,11 +593,11 @@ class GenericExtractor(Extractor):
# Applying user-defined extractor_args
if self.extractor_args:
for key, args in self.extractor_args.items():
logger.debug(f"Setting extractor_args: {key}")
if isinstance(args, dict):
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else:
arg_str = str(args)
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args:

View File

@@ -12,6 +12,12 @@
font-family: 'Roboto', sans-serif;
}
h2 {
white-space: normal;
overflow-wrap: break-word;
word-break: break-word;
}
table {
table-layout: fixed;
width: 90%;
@@ -97,13 +103,17 @@
background-color: #f1f1f1;
}
.pem-certificate, .text-preview {
.pem-certificate,
.text-preview {
text-align: left;
font-size: small;
}
.text-preview{
.text-preview {
padding-left: 10px;
padding-right: 10px;
max-height: 300px;
overflow: auto;
white-space: pre-wrap;
}
</style>

View File

@@ -15,7 +15,7 @@
- Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats).
### Notes
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
- Best used after enrichers like `thumbnail_enricher` or `antibot_extractor_enricher` (takes screenshots) to ensure images are available.
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
""",
}

View File

@@ -6,7 +6,7 @@ objects and calculates perceptual hashes using the PDQ hashing algorithm.
These hashes are designed specifically for images and can be used
for detecting duplicate or near-duplicate visual content.
This enricher is typically used after thumbnail or screenshot enrichers
This enricher is typically used after thumbnail or screenshot (antibot) enrichers
to ensure images are available for hashing.
"""

View File

@@ -40,6 +40,8 @@ class S3Storage(Storage):
try:
if media.mimetype:
extra_args["ContentType"] = media.mimetype
if "text" in media.mimetype:
extra_args["ContentType"] += "; charset=utf-8"
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)

View File

@@ -1 +0,0 @@
from .screenshot_enricher import ScreenshotEnricher

View File

@@ -1,44 +0,0 @@
{
"name": "Screenshot Enricher",
"type": ["enricher"],
"requires_setup": True,
"dependencies": {
"python": ["loguru", "selenium"],
},
"configs": {
"width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
"height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
"timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {
"default": 4,
"type": "int",
"help": "seconds to wait for the pages to load before taking screenshot",
},
"http_proxy": {
"default": "",
"help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
},
"save_to_pdf": {
"default": False,
"type": "bool",
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
},
"print_options": {
"default": {},
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
"type": "json_loader",
},
},
"description": """
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
### Features
- Takes screenshots of web pages, with configurable width, height, and timeout settings.
- Optionally saves pages as PDFs, with additional configuration for PDF printing options.
- Bypasses URLs detected as authentication walls.
- Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
### Notes
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
""",
}

View File

@@ -1,61 +0,0 @@
from loguru import logger
import time
import os
import base64
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
    """Enricher that attaches a screenshot (and optionally a PDF) of the page, taken with a webdriver."""

    def __init__(self, webdriver_factory=None):
        """`webdriver_factory` builds the driver context manager; falls back to `Webdriver` when falsy."""
        super().__init__()
        self.webdriver_factory = webdriver_factory if webdriver_factory else Webdriver

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Load the item's URL in a webdriver and add a "screenshot" media (plus a
        "pdf" media when `save_to_pdf` is set).

        URLs detected as auth walls are skipped unless cookie-type auth is
        configured. Errors while loading or capturing are logged, not raised.
        """
        url = to_enrich.get_url()
        logger.debug(f"Enriching screenshot for {url=}")

        auth = self.auth_for_site(url)
        # screenshot enricher only supports cookie-type auth (selenium)
        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))

        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
            unsupported_keys = ["username", "password", "api_key", "api_secret"]
            if any(auth.get(key) for key in unsupported_keys):
                logger.warning(
                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
                )
            return

        driver_context = self.webdriver_factory(
            self.width,
            self.height,
            self.timeout,
            facebook_accept_cookies="facebook.com" in url,
            http_proxy=self.http_proxy,
            print_options=self.print_options,
            auth=auth,
        )
        with driver_context as driver:
            try:
                driver.get(url)
                # give the page time to render before capturing
                time.sleep(int(self.sleep_before_screenshot))
                self._add_screenshot(driver, to_enrich)
                if self.save_to_pdf:
                    self._add_pdf(driver, to_enrich)
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

    def _add_screenshot(self, driver, to_enrich: Metadata) -> None:
        """Save a PNG screenshot into `self.tmp_dir` and attach it with id "screenshot"."""
        screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
        driver.save_screenshot(screenshot_file)
        to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")

    def _add_pdf(self, driver, to_enrich: Metadata) -> None:
        """Print the page to PDF, decode the base64 payload to a file and attach it with id "pdf"."""
        pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
        encoded_pdf = driver.print_page(driver.print_options)
        with open(pdf_file, "wb") as handle:
            handle.write(base64.b64decode(encoded_pdf))
        to_enrich.add_media(Media(filename=pdf_file), id="pdf")

View File

@@ -2,7 +2,6 @@
# we need to explicitly expose the available imports here
from .misc import *
from .webdriver import Webdriver
# handy utils from ytdlp
from yt_dlp.utils import clean_html, traverse_obj, strip_or_none, url_or_none

View File

@@ -1,167 +0,0 @@
"""This Webdriver class acts as a context manager for the selenium webdriver."""
from __future__ import annotations
import os
import time
import re
# import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions as selenium_exceptions
from selenium.webdriver.common.print_page_options import PrintOptions
from selenium.webdriver.common.by import By
from loguru import logger
class CookieSettingDriver(webdriver.Firefox):
    """
    Firefox driver that injects configured cookies before loading a URL and
    then tries to dismiss cookie-consent banners (or the Facebook login dialog).
    """

    facebook_accept_cookies: bool
    cookie: str
    cookie_jar: MozillaCookieJar

    # Button labels (matched case-insensitively) used to reject/accept cookie banners.
    _COOKIE_BUTTON_TEXTS = (
        "Refuse non-essential cookies",
        "Decline optional cookies",
        "Reject additional cookies",
        "Reject all",
        "Accept all cookies",
    )

    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
        """
        Args:
            cookie: explicit cookie string ("name=value; name2=value2") or None.
            cookie_jar: cookie jar whose entries matching the URL's domain are injected, or None.
            facebook_accept_cookies: when True, `get` closes the Facebook login dialog
                instead of looking for generic cookie-banner buttons.
            *args, **kwargs: forwarded to `webdriver.Firefox`.
        """
        if os.environ.get("RUNNING_IN_DOCKER"):
            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
            kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")

        super().__init__(*args, **kwargs)
        self.cookie = cookie
        self.cookie_jar = cookie_jar
        self.facebook_accept_cookies = facebook_accept_cookies

    @staticmethod
    def _button_xpath(text: str) -> str:
        """XPath matching any element whose text contains `text`, ignoring ASCII case."""
        return f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"

    def _load_cookies(self, url: str) -> None:
        """Visit the site's robots.txt to get a browsing context, then add the configured cookies."""
        # set up the driver to make it not 'cookie averse' (needs a context/URL)
        # get the 'robots.txt' file which should be quick and easy
        robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
        super().get(robots_url)

        if self.cookie:
            # an explicit cookie is set for this site, use that first.
            # Each "name=value" pair is split on the FIRST "=" only, since values may contain "=".
            for pair in self.cookie.split(";"):
                name, separator, value = pair.strip().partition("=")
                if not separator or not name:
                    continue  # skip empty or malformed fragments
                self.add_cookie({"name": name, "value": value})
        elif self.cookie_jar:
            domain = urlparse(url).netloc.removeprefix("www.")
            # escape the domain so its dots are literal; allow an optional "www" and leading dot
            regex = re.compile(rf"(www)?\.?{re.escape(domain)}$")
            for cookie in self.cookie_jar:
                if not regex.match(cookie.domain):
                    continue
                try:
                    self.add_cookie(
                        {
                            "name": cookie.name,
                            "value": cookie.value,
                            "path": cookie.path,
                            "domain": cookie.domain,
                            "secure": bool(cookie.secure),
                            "expiry": cookie.expires,
                        }
                    )
                except Exception as e:
                    # best effort: one rejected cookie must not abort the page load
                    logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")

    def get(self, url: str):
        """Load `url`, first injecting any configured cookies, then dismiss consent/login overlays."""
        if self.cookie_jar or self.cookie:
            self._load_cookies(url)

        # now get the actual URL
        super().get(url)
        time.sleep(2)

        # Try and use some common button text to reject/accept cookies
        for text in self._COOKIE_BUTTON_TEXTS:
            try:
                self.find_element(By.XPATH, self._button_xpath(text)).click()
                time.sleep(2)
            except selenium_exceptions.NoSuchElementException:
                # this label is not on the page; try the next one
                pass

        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
            try:
                xpath = "//div[@role='dialog']//div[@aria-label='Close']"
                self.find_element(By.XPATH, xpath).click()
                time.sleep(2)
            except selenium_exceptions.NoSuchElementException:
                logger.warning("Unable to find the 'close' button on the facebook login window")
        else:
            # for all other sites, try and use some common button text to reject/accept cookies
            for text in self._COOKIE_BUTTON_TEXTS:
                try:
                    xpath = self._button_xpath(text)
                    WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
                    break
                except selenium_exceptions.WebDriverException:
                    # not clickable within the wait; try the next label
                    pass
class Webdriver:
    """
    Context manager that creates a headless Firefox `CookieSettingDriver` on
    entry and closes/quits it on exit.
    """

    def __init__(
        self,
        width: int,
        height: int,
        timeout_seconds: int,
        facebook_accept_cookies: bool = False,
        http_proxy: str = "",
        print_options: dict = None,
        auth: dict = None,
    ) -> None:
        """
        Args:
            width, height: browser window size.
            timeout_seconds: page-load timeout applied to the driver.
            facebook_accept_cookies: forwarded to the driver; also forces the browser language to English.
            http_proxy: proxy passed to the browser; ignored when empty.
            print_options: attribute name -> value pairs set on a `PrintOptions` object.
            auth: auth settings; only the "cookie" and "cookies_jar" keys are read.
        """
        self.width = width
        self.height = height
        self.timeout_seconds = timeout_seconds
        # None defaults avoid sharing one mutable dict between instances
        self.auth = auth or {}
        self.facebook_accept_cookies = facebook_accept_cookies
        self.http_proxy = http_proxy

        # create and set print options
        self.print_options = PrintOptions()
        for k, v in (print_options or {}).items():
            setattr(self.print_options, k, v)

    def __enter__(self) -> CookieSettingDriver:
        """Create and configure the driver. Re-raises a TimeoutException if no driver could be created."""
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        if self.http_proxy:
            # only pass the flag when a proxy is actually configured
            options.add_argument(f"--proxy-server={self.http_proxy}")
        options.set_preference("network.protocol-handler.external.tg", False)
        # if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
        if self.facebook_accept_cookies:
            options.add_argument("--lang=en")

        self.driver = None
        try:
            self.driver = CookieSettingDriver(
                cookie=self.auth.get("cookie"),
                cookie_jar=self.auth.get("cookies_jar"),
                facebook_accept_cookies=self.facebook_accept_cookies,
                options=options,
            )
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
            self.driver.print_options = self.print_options
        except selenium_exceptions.TimeoutException as e:
            logger.error(
                f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}"
            )
            if self.driver is None:
                # nothing to hand back: surface the real error instead of an AttributeError
                raise

        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close and quit the driver. Returns True, so exceptions from the `with` body are suppressed."""
        try:
            self.driver.close()
        finally:
            # always quit, even if close() failed, so the browser process is not leaked
            try:
                self.driver.quit()
            finally:
                del self.driver
        # NOTE(review): returning True swallows every exception raised inside the `with` body;
        # kept because callers may rely on this best-effort behaviour.
        return True

View File

@@ -1,216 +0,0 @@
import base64
import pytest
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
@pytest.fixture
def mock_selenium_env(mocker):
    """
    Stub out the Selenium and driver touchpoints so no real browser or
    geckodriver is needed.

    Yields a `(driver, driver_class, options)` tuple of mocks.
    """
    # shutil.which only "finds" geckodriver
    mocker.patch(
        "shutil.which",
        side_effect=lambda dep: "/mock/geckodriver" if dep == "geckodriver" else None,
    )

    # CookieSettingDriver(...) hands back a MagicMock driver
    fake_driver = mocker.MagicMock()
    driver_cls = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver", return_value=fake_driver)

    # Selenium Manager resolves to fake binaries that "exist" on disk
    mocker.patch(
        "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths",
        return_value={
            "driver_path": "/mock/driver",
            "browser_path": "/mock/browser",
        },
    )
    mocker.patch("pathlib.Path.is_file", return_value=True)

    # subprocess.Popen returns a process that looks like it is still running
    running_proc = mocker.MagicMock()
    running_proc.poll.return_value = None
    mocker.patch("subprocess.Popen", return_value=running_proc)

    mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)

    # FirefoxOptions() returns a MagicMock instance
    fake_options = mocker.MagicMock()
    mocker.patch("selenium.webdriver.FirefoxOptions", return_value=fake_options)

    yield fake_driver, driver_cls, fake_options
@pytest.fixture
def common_patches(tmp_path, mocker):
    """Apply the patches shared by several tests.

    ``is_auth_wall`` is forced to ``False``, ``os.path.join`` always returns
    ``<tmp_path>/test.png``, and ``time.sleep`` is replaced by a mock.
    """
    replacements = (
        ("auto_archiver.utils.url.is_auth_wall", {"return_value": False}),
        ("os.path.join", {"return_value": str(tmp_path / "test.png")}),
        ("time.sleep", {}),
    )
    for target, mock_kwargs in replacements:
        mocker.patch(target, **mock_kwargs)
    yield
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
    """Build the ``screenshot_enricher`` module through ``setup_module``.

    ``mock_binary_dependencies`` is requested only so that fixture is active;
    its value is not used here.
    """
    return setup_module(
        "screenshot_enricher",
        dict(
            width=1280,
            height=720,
            timeout=60,
            sleep_before_screenshot=4,
            http_proxy="",
            save_to_pdf="False",
            print_options={},
        ),
    )
@pytest.fixture
def metadata_with_video():
    """Return a ``Metadata`` for https://example.com holding one video media (id ``video1``)."""
    item = Metadata()
    item.set_url("https://example.com")
    video = Media(filename="video.mp4").set("id", "video1")
    item.add_media(video)
    return item
def test_enrich_adds_screenshot(screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, tmp_path):
    """Enriching a regular page creates the driver, loads the URL and appends a screenshot media."""
    driver, driver_cls, options = mock_selenium_env

    screenshot_enricher.enrich(metadata_with_video)

    # The driver must have been built exactly once, without any cookies.
    expected_driver_kwargs = {
        "cookie": None,
        "cookie_jar": None,
        "facebook_accept_cookies": False,
        "options": options,
    }
    driver_cls.assert_called_once_with(**expected_driver_kwargs)

    # Calls made on the driver instance returned by the patched class.
    driver.get.assert_called_once_with("https://example.com")
    expected_path = str(tmp_path / "test.png")
    driver.save_screenshot.assert_called_once_with(expected_path)

    # Original video + the new screenshot.
    assert len(metadata_with_video.media) == 2
    assert metadata_with_video.media[1].properties.get("id") == "screenshot"
@pytest.mark.parametrize(
    ("url", "is_auth"),
    [
        ("https://example.com", False),
        ("https://private.com", True),
    ],
)
def test_enrich_auth_wall(
    screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
):
    """The page is only loaded and screenshotted when the URL is not reported as an auth wall."""
    driver = mock_selenium_env[0]
    # Override the default from `common_patches` with the parametrized value.
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth)
    metadata_with_video.set_url(url)

    screenshot_enricher.enrich(metadata_with_video)

    if not is_auth:
        driver.get.assert_called_once_with(url)
        assert len(metadata_with_video.media) == 2
        assert metadata_with_video.media[1].properties.get("id") == "screenshot"
    else:
        # Auth wall: the driver never loads the page and only the video remains.
        driver.get.assert_not_called()
        assert len(metadata_with_video.media) == 1
        assert metadata_with_video.media[0].properties.get("id") == "video1"
def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
    """Enriching an instagram.com URL logs the ``[SKIP] SCREENSHOT`` message."""
    item = Metadata().set_url("https://instagram.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "[SKIP] SCREENSHOT since url" in caplog.text
@pytest.mark.parametrize("auth", [{"cookie": "cookie"}, {"cookies_jar": "cookie"}])
def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
    """With cookie-style authentication configured, an auth-wall URL is not skipped."""
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    # Replace the enricher's authentication mapping for this site.
    screenshot_enricher.authentication = {"example.com": auth}
    item = Metadata().set_url("https://example.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "[SKIP] SCREENSHOT since url" not in caplog.text
def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
    """Username/password authentication on an auth-wall URL produces the cookie-only warning."""
    _driver, _driver_cls, _ = mock_selenium_env
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    credentials = {"username": "user", "password": "pass"}
    screenshot_enricher.authentication = {"example.com": credentials}
    item = Metadata().set_url("https://example.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    """A ``TimeoutException`` from ``driver.get`` is logged at info level and adds no media."""
    driver, _driver_cls, _options = mock_selenium_env
    driver.get.side_effect = TimeoutException
    info_spy = mocker.patch("loguru.logger.info")

    screenshot_enricher.enrich(metadata_with_video)

    info_spy.assert_called_once_with("TimeoutException loading page for screenshot")
    assert len(metadata_with_video.media) == 1
def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    """An unexpected error from ``save_screenshot`` is logged as an error and adds no media."""
    driver, _driver_cls, _options = mock_selenium_env
    # Page load succeeds; taking the screenshot blows up.
    driver.get.return_value = None
    driver.save_screenshot.side_effect = Exception("Unexpected Error")
    error_spy = mocker.patch("loguru.logger.error")

    screenshot_enricher.enrich(metadata_with_video)

    # The failure is reported through the logger ...
    error_spy.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
    # ... and the media list still only holds the original video.
    assert len(metadata_with_video.media) == 1
def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env):
    """With ``save_to_pdf`` enabled, both a screenshot and a PDF media are added."""
    driver, _driver_cls, _options = mock_selenium_env

    # Turn PDF output on for this test only.
    screenshot_enricher.save_to_pdf = True

    # print_page() hands back base64 text.
    encoded_pdf = base64.b64encode(b"fake_pdf_content").decode("utf-8")
    driver.print_page.return_value = encoded_pdf

    def last_component(*args):
        # Stand-in for os.path.join: keep only the final path component.
        return f"{args[-1]}"

    mocker.patch("os.path.join", side_effect=last_component)
    mocker.patch(
        "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
        return_value="fixed123",
    )
    open_spy = mocker.patch("builtins.open", new_callable=mocker.mock_open)

    screenshot_enricher.enrich(metadata_with_video)

    # Screenshot taken and page printed with the driver's print options.
    driver.save_screenshot.assert_called_once()
    driver.print_page.assert_called_once_with(driver.print_options)

    # The PDF file was opened for binary writing.
    open_spy.assert_any_call("pdf_fixed123.pdf", "wb")

    # Original video + screenshot + pdf.
    assert len(metadata_with_video.media) == 3
    assert metadata_with_video.media[1].properties.get("id") == "screenshot"
    assert metadata_with_video.media[2].properties.get("id") == "pdf"
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
    """After every test, unlink each entry found directly inside ``tmp_path``."""
    yield
    for leftover in tmp_path.iterdir():
        leftover.unlink()

View File

@@ -0,0 +1,173 @@
import pytest
from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher
from .test_extractor_base import TestExtractorBase
class DummySB:
    """Minimal stand-in exposing the four query methods used by the tests.

    It simply echoes back the URL/title it was built with and answers
    visibility queries by membership in the supplied collections.
    """

    def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
        self._url = url
        self._title = title
        # Falsy arguments (None, empty collections) fall back to a fresh empty set.
        self._visible_texts = visible_texts if visible_texts else set()
        self._visible_elements = visible_elements if visible_elements else set()

    def get_current_url(self):
        """Return the URL given at construction."""
        return self._url

    def get_title(self):
        """Return the title given at construction."""
        return self._title

    def is_text_visible(self, text):
        """Return whether ``text`` is one of the configured visible texts."""
        return text in self._visible_texts

    def is_element_visible(self, selector):
        """Return whether ``selector`` is one of the configured visible elements."""
        return selector in self._visible_elements
class TestAntibotExtractorEnricher(TestExtractorBase):
    """Tests Antibot Extractor/Enricher"""

    extractor_module = "antibot_extractor_enricher"
    extractor: AntibotExtractorEnricher
    # Default module configuration for the tests in this class.
    config = {
        "save_to_pdf": False,
        "max_download_images": 0,
        "max_download_videos": 0,
        "exclude_media_extensions": ".svg,.ico,.gif",
        "proxy": None,
    }

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_title,image_count,video_count",
        [
            (
                "https://en.wikipedia.org/wiki/Western_barn_owl",
                "western barn owl",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
                "open sources show myanmar",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
                "shot from above",
                5,
                1,
            ),
            (
                "https://www.bellingcat.com/about/general-information",
                "general information",
                0,  # SVGs are ignored
                0,
            ),
        ],
    )
    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count):
        """
        Test downloading pages with media.

        Checks the result status, the page title, the number of image and
        video media, and that screenshot, PDF and HTML source media exist.
        """
        # This test needs its own configuration (PDF on, media downloads on).
        self.extractor = setup_module(
            self.extractor_module,
            {
                "save_to_pdf": True,
                "max_download_images": 5,
                "max_download_videos": "inf",
            },
        )

        item = make_item(url)
        result = self.extractor.download(item)

        assert result.status == "antibot", "Expected status to be 'antibot'"

        # Check the lower-cased title contains the expected phrase
        page_title = result.get_title() or ""
        assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"

        # The screenshot is an image too, but it is not a downloaded page image.
        image_media = [m for m in result.media if m.is_image() and m.get("id") != "screenshot"]
        assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"

        video_media = [m for m in result.media if m.is_video()]
        assert len(video_media) == video_count, f"Expected {video_count} video items, got {len(video_media)}"

        for expected_id in ["screenshot", "pdf", "html_source_code"]:
            assert any(m.get("id") == expected_id for m in result.media), (
                f"Expected media with id '{expected_id}' not found"
            )

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_html",
        [
            (
                "https://myrotvorets.center/about/",
                "Центр «Миротворець»",
            ),
            (
                "https://seleniumbase.io/apps/turnstile",
                'id="captcha-success"',
            ),
        ],
    )
    def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
        """
        Test downloading a page with Cloudflare Turnstile captcha.

        The saved HTML source must contain ``in_html`` (compared case-insensitively).
        """
        item = make_item(url)
        self.extractor.enrich(item)
        assert item.status != "antibot", "Expected status not to be 'antibot' after handling Cloudflare Turnstile"

        html_media = item.get_media_by_id("html_source_code")
        # Fail with a clear message instead of an AttributeError on `.filename`.
        assert html_media is not None, "Expected media with id 'html_source_code' not found"
        with open(html_media.filename, "r", encoding="utf-8") as f:
            html_content = f.read()
        assert in_html.lower() in html_content.lower(), f"Expected HTML to contain '{in_html}'"

    @pytest.mark.parametrize(
        "url,title,visible_texts,visible_elements,expected",
        [
            # One signal per case: URL, title, visible text, visible element
            ("https://example.com/login", "Welcome", set(), set(), True),
            ("https://example.com/somepage", "Just a moment...", set(), set(), True),
            ("https://example.com/", "Welcome", {"Please log in"}, set(), True),
            ("https://example.com/", "Welcome", set(), {"input[type='password']"}, True),
            # Benign visible text must not count as an auth wall
            ("https://example.com/", "Welcome", {"No issue here"}, set(), False),
            # Title triggers
            ("https://example.com/", "Log in", set(), set(), True),
            ("https://example.com/", "Verification required", set(), set(), True),
            # Text triggers (case-insensitive)
            ("https://example.com/", "Welcome", {"Sign up or log in"}, set(), True),
            ("https://example.com/", "Welcome", {"sign up or log in"}, set(), True),
            # Element triggers
            ("https://example.com/", "Welcome", set(), {"input[name='email']"}, True),
            # No triggers
            ("https://example.com/", "Welcome", set(), set(), False),
        ],
    )
    def test_hit_auth_wall(self, url, title, visible_texts, visible_elements, expected):
        """Check ``_hit_auth_wall`` against a ``DummySB`` built from the parametrized page state."""
        extractor = AntibotExtractorEnricher()
        sb = DummySB(url=url, title=title, visible_texts=visible_texts, visible_elements=visible_elements)
        assert extractor._hit_auth_wall(sb) == expected

    def test_enrich_handles_sb_exception(self, make_item, mocker):
        """
        Test that enrich returns False and logs error if SB raises an exception.
        """
        # Patch SB so that calling it raises immediately
        mock_sb = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.SB")
        mock_logger = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.logger")
        mock_sb.side_effect = Exception("SB failed")

        item = make_item("https://example.com/")
        result = self.extractor.enrich(item)

        assert result is False
        mock_logger.error.assert_called()

View File

@@ -97,7 +97,7 @@ class TestGenericExtractor(TestExtractorBase):
)
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existend posts/media
Test to make sure that the extractor doesn't break on non-existent posts/media
It should return 'False'
"""

View File

@@ -45,6 +45,19 @@ class TestS3Storage:
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
def test_uploadf_sets_acl_public(self, mocker):
    """``uploadf`` must pass the file to ``upload_fileobj`` with a public-read ACL and PNG content type."""
    media = Media("test.png")
    file_obj = mocker.MagicMock()
    upload_spy = mocker.patch.object(self.storage.s3, "upload_fileobj")
    # Force the upload path regardless of what the storage would decide.
    mocker.patch.object(self.storage, "is_upload_needed", return_value=True)

    self.storage.uploadf(file_obj, media)

    expected_extra_args = {"ACL": "public-read", "ContentType": "image/png"}
    upload_spy.assert_called_once_with(
        file_obj,
        Bucket="test-bucket",
        Key=media.key,
        ExtraArgs=expected_extra_args,
    )
def test_uploadf_detects_charset_for_text_files(self, mocker):
media = Media("test.txt")
mock_file = mocker.MagicMock()
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
@@ -54,7 +67,7 @@ class TestS3Storage:
mock_file,
Bucket="test-bucket",
Key=media.key,
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain; charset=utf-8"},
)
def test_upload_decision_logic(self, mocker):