mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Merge pull request #311 from bellingcat/feat/seleniumbase
Replaces ScreenshotEnricher with AntibotExtractorEnricher, removes VkExtractor
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -36,4 +36,4 @@ docs/source/autoapi/
|
||||
docs/source/modules/autogen/
|
||||
scripts/settings_page.html
|
||||
scripts/settings/src/schema.json
|
||||
.vite
|
||||
.vite
|
||||
22
Dockerfile
22
Dockerfile
@@ -11,26 +11,8 @@ ENV RUNNING_IN_DOCKER=1 \
|
||||
ARG TARGETARCH
|
||||
|
||||
# Installing system dependencies
|
||||
RUN add-apt-repository ppa:mozillateam/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
|
||||
apt-get install -y --no-install-recommends firefox-esr && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox
|
||||
|
||||
ARG GECKODRIVER_VERSION=0.36.0
|
||||
|
||||
RUN if [ $(uname -m) = "aarch64" ]; then \
|
||||
GECKODRIVER_ARCH=linux-aarch64; \
|
||||
else \
|
||||
GECKODRIVER_ARCH=linux64; \
|
||||
fi && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \
|
||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||
chmod +x /usr/local/bin/geckodriver && \
|
||||
rm geckodriver-v* && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool python3-tk
|
||||
|
||||
# Poetry and runtime
|
||||
FROM base AS runtime
|
||||
|
||||
@@ -8,7 +8,7 @@ The archiver archives web pages using the following workflow
|
||||
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
|
||||
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
|
||||
|
||||
Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module would extract information from the Twitter website. The Screenshot Enricher Module will take screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available.
|
||||
Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module would extract information from the Twitter website. The AntiBot Module will download HTML and take screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available.
|
||||
|
||||
Auto-archiver must have at least one module defined for each step of the workflow. This is done by setting the [configuration](installation/configurations.md) for your auto-archiver instance.
|
||||
|
||||
|
||||
@@ -51,8 +51,8 @@ After this, you're ready to set up your [your configuration file](configurations
|
||||
If using the local installation method, you will also need to install the following dependencies locally:
|
||||
|
||||
1.[ffmpeg](https://www.ffmpeg.org/) - for handling of downloaded videos
|
||||
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin` - for taking webpage screenshots with the screenshot enricher
|
||||
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
||||
<!-- 2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin` - for taking webpage screenshots with the screenshot enricher -->
|
||||
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium screenshots: `sudo apt install fonts-noto -y`.
|
||||
4. [Browsertrix Crawler docker image](https://hub.docker.com/r/webrecorder/browsertrix-crawler) for the WACZ enricher/archiver
|
||||
|
||||
|
||||
|
||||
695
poetry.lock
generated
695
poetry.lock
generated
@@ -158,6 +158,27 @@ charset-normalizer = ["charset-normalizer"]
|
||||
html5lib = ["html5lib"]
|
||||
lxml = ["lxml"]
|
||||
|
||||
[[package]]
|
||||
name = "behave"
|
||||
version = "1.2.6"
|
||||
description = "behave is behaviour-driven development, Python style"
|
||||
optional = false
|
||||
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "behave-1.2.6-py2.py3-none-any.whl", hash = "sha256:ebda1a6c9e5bfe95c5f9f0a2794e01c7098b3dde86c10a95d8621c5907ff6f1c"},
|
||||
{file = "behave-1.2.6.tar.gz", hash = "sha256:b9662327aa53294c1351b0a9c369093ccec1d21026f050c3bd9b3e5cccf81a86"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
parse = ">=1.8.2"
|
||||
parse-type = ">=0.4.2"
|
||||
six = ">=1.11"
|
||||
|
||||
[package.extras]
|
||||
develop = ["coverage", "invoke (>=0.21.0)", "modernize (>=0.5)", "path.py (>=8.1.2)", "pathlib", "pycmd", "pylint", "pytest (>=3.0)", "pytest-cov", "tox"]
|
||||
docs = ["sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6)"]
|
||||
|
||||
[[package]]
|
||||
name = "bgutil-ytdlp-pot-provider"
|
||||
version = "1.1.0"
|
||||
@@ -518,6 +539,18 @@ files = [
|
||||
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "5.2.0"
|
||||
description = "Universal encoding detector for Python 3"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
|
||||
{file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.2"
|
||||
@@ -646,7 +679,7 @@ files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
markers = {main = "sys_platform == \"win32\" or platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
|
||||
markers = {dev = "sys_platform == \"win32\""}
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
@@ -708,6 +741,18 @@ ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["certifi (>=2024)", "cryptography-vectors (==44.0.3)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
name = "cssselect"
|
||||
version = "1.3.0"
|
||||
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d"},
|
||||
{file = "cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curl-cffi"
|
||||
version = "0.10.0"
|
||||
@@ -808,11 +853,11 @@ description = "Backport of PEP 654 (exception groups)"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main", "dev", "docs"]
|
||||
markers = "python_version == \"3.10\""
|
||||
files = [
|
||||
{file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"},
|
||||
{file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"},
|
||||
]
|
||||
markers = {dev = "python_version == \"3.10\"", docs = "python_version == \"3.10\""}
|
||||
|
||||
[package.dependencies]
|
||||
typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""}
|
||||
@@ -820,6 +865,33 @@ typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""}
|
||||
[package.extras]
|
||||
test = ["pytest (>=6)"]
|
||||
|
||||
[[package]]
|
||||
name = "execnet"
|
||||
version = "2.1.1"
|
||||
description = "execnet: rapid multi-Python deployment"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"},
|
||||
{file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
testing = ["hatch", "pre-commit", "pytest", "tox"]
|
||||
|
||||
[[package]]
|
||||
name = "fasteners"
|
||||
version = "0.19"
|
||||
description = "A python package that provides useful locks"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "fasteners-0.19-py3-none-any.whl", hash = "sha256:758819cb5d94cdedf4e836988b74de396ceacb8e2794d21f82d131fd9ee77237"},
|
||||
{file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ffmpeg-python"
|
||||
version = "0.2.0"
|
||||
@@ -844,7 +916,7 @@ version = "3.18.0"
|
||||
description = "A platform independent file lock."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"},
|
||||
{file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"},
|
||||
@@ -1082,7 +1154,7 @@ version = "2.1.0"
|
||||
description = "brain-dead simple config-ini parsing"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
|
||||
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
|
||||
@@ -1355,6 +1427,22 @@ files = [
|
||||
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mouseinfo"
|
||||
version = "0.1.3"
|
||||
description = "An application to display XY position and RGB color information for the pixel currently under the mouse. Works on Python 2 and 3."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "MouseInfo-0.1.3.tar.gz", hash = "sha256:2c62fb8885062b8e520a3cce0a297c657adcc08c60952eb05bc8256ef6f7f6e7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyperclip = "*"
|
||||
python3-Xlib = {version = "*", markers = "platform_system == \"Linux\" and python_version >= \"3.0\""}
|
||||
rubicon-objc = {version = "*", markers = "platform_system == \"Darwin\""}
|
||||
|
||||
[[package]]
|
||||
name = "mutagen"
|
||||
version = "1.47.0"
|
||||
@@ -1367,6 +1455,18 @@ files = [
|
||||
{file = "mutagen-1.47.0.tar.gz", hash = "sha256:719fadef0a978c31b4cf3c956261b3c58b6948b32023078a2117b1de09f0fc99"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mycdp"
|
||||
version = "1.2.0"
|
||||
description = "Autogenerated CDP utilities for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "mycdp-1.2.0-py3-none-any.whl", hash = "sha256:8f9ef628fa68e391f59ad9cd555ae75746bd3a48947017c9ecc65a63624a1d41"},
|
||||
{file = "mycdp-1.2.0.tar.gz", hash = "sha256:0603fd8e3454147c4f549edaa13f5294f57ecb481640c03f808ed548a03f796f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mypy-extensions"
|
||||
version = "1.1.0"
|
||||
@@ -1562,6 +1662,71 @@ files = [
|
||||
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parameterized"
|
||||
version = "0.9.0"
|
||||
description = "Parameterized testing with any Python test framework"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "parameterized-0.9.0-py2.py3-none-any.whl", hash = "sha256:4e0758e3d41bea3bbd05ec14fc2c24736723f243b28d702081aef438c9372b1b"},
|
||||
{file = "parameterized-0.9.0.tar.gz", hash = "sha256:7fc905272cefa4f364c1a3429cbbe9c0f98b793988efb5bf90aac80f08db09b1"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["jinja2"]
|
||||
|
||||
[[package]]
|
||||
name = "parse"
|
||||
version = "1.20.2"
|
||||
description = "parse() is the opposite of format()"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "parse-1.20.2-py2.py3-none-any.whl", hash = "sha256:967095588cb802add9177d0c0b6133b5ba33b1ea9007ca800e526f42a85af558"},
|
||||
{file = "parse-1.20.2.tar.gz", hash = "sha256:b41d604d16503c79d81af5165155c0b20f6c8d6c559efa66b4b695c3e5a0a0ce"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse-type"
|
||||
version = "0.6.4"
|
||||
description = "Simplifies to build parse types based on the parse module"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,>=2.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "parse_type-0.6.4-py2.py3-none-any.whl", hash = "sha256:83d41144a82d6b8541127bf212dd76c7f01baff680b498ce8a4d052a7a5bce4c"},
|
||||
{file = "parse_type-0.6.4.tar.gz", hash = "sha256:5e1ec10440b000c3f818006033372939e693a9ec0176f446d9303e4db88489a6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
parse = {version = ">=1.18.0", markers = "python_version >= \"3.0\""}
|
||||
six = ">=1.15"
|
||||
|
||||
[package.extras]
|
||||
develop = ["build (>=0.5.1)", "coverage (>=4.4)", "pylint", "pytest (<5.0) ; python_version < \"3.0\"", "pytest (>=5.0) ; python_version >= \"3.0\"", "pytest-cov", "pytest-html (>=1.19.0)", "ruff ; python_version >= \"3.7\"", "setuptools", "setuptools-scm", "tox (>=2.8,<4.0)", "twine (>=1.13.0)", "virtualenv (<20.22.0) ; python_version <= \"3.6\"", "virtualenv (>=20.0.0) ; python_version > \"3.6\"", "wheel"]
|
||||
docs = ["Sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6.0)"]
|
||||
testing = ["pytest (<5.0) ; python_version < \"3.0\"", "pytest (>=5.0) ; python_version >= \"3.0\"", "pytest-html (>=1.19.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pdbp"
|
||||
version = "1.7.0"
|
||||
description = "pdbp (Pdb+): A drop-in replacement for pdb and pdbpp."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pdbp-1.7.0-py3-none-any.whl", hash = "sha256:6ad99cb4e9f2fc1a5b4ef4f2e0acdb28b18b271bf71f6c9f997b652d935caa19"},
|
||||
{file = "pdbp-1.7.0.tar.gz", hash = "sha256:d0a5b275720c451f5574427e35523aeb61c244f3faf622a80fe03019ef82d380"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = ">=0.4.6", markers = "platform_system == \"Windows\""}
|
||||
pygments = ">=2.19.1"
|
||||
tabcompleter = ">=1.4.0"
|
||||
|
||||
[[package]]
|
||||
name = "pdqhash"
|
||||
version = "0.2.8"
|
||||
@@ -1681,13 +1846,25 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
|
||||
typing = ["typing-extensions ; python_version < \"3.10\""]
|
||||
xmp = ["defusedxml"]
|
||||
|
||||
[[package]]
|
||||
name = "pip"
|
||||
version = "25.1.1"
|
||||
description = "The PyPA recommended tool for installing Python packages."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pip-25.1.1-py3-none-any.whl", hash = "sha256:2913a38a2abf4ea6b64ab507bd9e967f3b53dc1ede74b01b0931e1ce548751af"},
|
||||
{file = "pip-25.1.1.tar.gz", hash = "sha256:3de45d411d308d5054c2168185d8da7f9a2cd753dbac8acbfa88a8909ecd9077"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.8"
|
||||
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"},
|
||||
{file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"},
|
||||
@@ -1704,7 +1881,7 @@ version = "1.6.0"
|
||||
description = "plugin and hook calling mechanisms for python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"},
|
||||
{file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"},
|
||||
@@ -1808,6 +1985,27 @@ files = [
|
||||
[package.dependencies]
|
||||
pyasn1 = ">=0.6.1,<0.7.0"
|
||||
|
||||
[[package]]
|
||||
name = "pyautogui"
|
||||
version = "0.9.54"
|
||||
description = "PyAutoGUI lets Python control the mouse and keyboard, and other GUI automation tasks. For Windows, macOS, and Linux, on Python 3 and 2."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "PyAutoGUI-0.9.54.tar.gz", hash = "sha256:dd1d29e8fd118941cb193f74df57e5c6ff8e9253b99c7b04f39cfc69f3ae04b2"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
mouseinfo = "*"
|
||||
pygetwindow = ">=0.0.5"
|
||||
pymsgbox = "*"
|
||||
pyobjc-core = {version = "*", markers = "platform_system == \"Darwin\""}
|
||||
pyobjc-framework-quartz = {version = "*", markers = "platform_system == \"Darwin\""}
|
||||
pyscreeze = ">=0.1.21"
|
||||
python3-Xlib = {version = "*", markers = "platform_system == \"Linux\" and python_version >= \"3.0\""}
|
||||
pytweening = ">=1.0.4"
|
||||
|
||||
[[package]]
|
||||
name = "pycodestyle"
|
||||
version = "2.13.0"
|
||||
@@ -1912,6 +2110,20 @@ doc = ["ablog (>=0.11.8)", "colorama", "graphviz", "ipykernel", "ipyleaflet", "i
|
||||
i18n = ["Babel", "jinja2"]
|
||||
test = ["pytest", "pytest-cov", "pytest-regressions", "sphinx[test]"]
|
||||
|
||||
[[package]]
|
||||
name = "pygetwindow"
|
||||
version = "0.0.9"
|
||||
description = "A simple, cross-platform module for obtaining GUI information on application's windows."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "PyGetWindow-0.0.9.tar.gz", hash = "sha256:17894355e7d2b305cd832d717708384017c1698a90ce24f6f7fbf0242dd0a688"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyrect = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.19.1"
|
||||
@@ -1927,6 +2139,105 @@ files = [
|
||||
[package.extras]
|
||||
windows-terminal = ["colorama (>=0.4.6)"]
|
||||
|
||||
[[package]]
|
||||
name = "pymsgbox"
|
||||
version = "1.0.9"
|
||||
description = "A simple, cross-platform, pure Python module for JavaScript-like message boxes."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "PyMsgBox-1.0.9.tar.gz", hash = "sha256:2194227de8bff7a3d6da541848705a155dcbb2a06ee120d9f280a1d7f51263ff"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pynose"
|
||||
version = "1.5.4"
|
||||
description = "pynose fixes nose to extend unittest and make testing easier"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pynose-1.5.4-py3-none-any.whl", hash = "sha256:ee4ae91c9e2b54d46647f5d81b95392dd27e02ed26f016dadb5f1ac10f949d96"},
|
||||
{file = "pynose-1.5.4.tar.gz", hash = "sha256:97dd0b7e85cf990120a01147e83ccd960c09ffcd69f6822f18e14128c6655e67"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-core"
|
||||
version = "11.0"
|
||||
description = "Python<->ObjC Interoperability Module"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Darwin\""
|
||||
files = [
|
||||
{file = "pyobjc_core-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:10866b3a734d47caf48e456eea0d4815c2c9b21856157db5917b61dee06893a1"},
|
||||
{file = "pyobjc_core-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:50675c0bb8696fe960a28466f9baf6943df2928a1fd85625d678fa2f428bd0bd"},
|
||||
{file = "pyobjc_core-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a03061d4955c62ddd7754224a80cdadfdf17b6b5f60df1d9169a3b1b02923f0b"},
|
||||
{file = "pyobjc_core-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c338c1deb7ab2e9436d4175d1127da2eeed4a1b564b3d83b9f3ae4844ba97e86"},
|
||||
{file = "pyobjc_core-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b4e9dc4296110f251a4033ff3f40320b35873ea7f876bd29a1c9705bb5e08c59"},
|
||||
{file = "pyobjc_core-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:02406ece449d0f41b31e579e47ca77ced3eb57533df955281bfcecc99da74fba"},
|
||||
{file = "pyobjc_core-11.0.tar.gz", hash = "sha256:63bced211cb8a8fb5c8ff46473603da30e51112861bd02c438fbbbc8578d9a70"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-cocoa"
|
||||
version = "11.0"
|
||||
description = "Wrappers for the Cocoa frameworks on macOS"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Darwin\""
|
||||
files = [
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fbc65f260d617d5463c7fb9dbaaffc23c9a4fabfe3b1a50b039b61870b8daefd"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3ea7be6e6dd801b297440de02d312ba3fa7fd3c322db747ae1cb237e975f5d33"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:280a577b83c68175a28b2b7138d1d2d3111f2b2b66c30e86f81a19c2b02eae71"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:15b2bd977ed340074f930f1330f03d42912d5882b697d78bd06f8ebe263ef92e"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5750001db544e67f2b66f02067d8f0da96bb2ef71732bde104f01b8628f9d7ea"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ddff25b0755d59873d186e1e07d6aaddb19d55e3ae890d69ff2d9babf8627657"},
|
||||
{file = "pyobjc_framework_cocoa-11.0.tar.gz", hash = "sha256:00346a8cb81ad7b017b32ff7bf596000f9faa905807b1bd234644ebd47f692c5"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=11.0"
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-quartz"
|
||||
version = "11.0"
|
||||
description = "Wrappers for the Quartz frameworks on macOS"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Darwin\""
|
||||
files = [
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:da3ab13c9f92361959b41b0ad4cdd41ae872f90a6d8c58a9ed699bc08ab1c45c"},
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d251696bfd8e8ef72fbc90eb29fec95cb9d1cc409008a183d5cc3246130ae8c2"},
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cb4a9f2d9d580ea15e25e6b270f47681afb5689cafc9e25712445ce715bcd18e"},
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:973b4f9b8ab844574461a038bd5269f425a7368d6e677e3cc81fcc9b27b65498"},
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:66ab58d65348863b8707e63b2ec5cdc54569ee8189d1af90d52f29f5fdf6272c"},
|
||||
{file = "pyobjc_framework_Quartz-11.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1032f63f2a4ee98366764e69c249f1d93813821e17d224cf626cf11fb1801fc4"},
|
||||
{file = "pyobjc_framework_quartz-11.0.tar.gz", hash = "sha256:3205bf7795fb9ae34747f701486b3db6dfac71924894d1f372977c4d70c3c619"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=11.0"
|
||||
pyobjc-framework-Cocoa = ">=11.0"
|
||||
|
||||
[[package]]
|
||||
name = "pyotp"
|
||||
version = "2.9.0"
|
||||
description = "Python One Time Password Library"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pyotp-2.9.0-py3-none-any.whl", hash = "sha256:81c2e5865b8ac55e825b0358e496e1d9387c811e85bb40e71a3b29b288963612"},
|
||||
{file = "pyotp-2.9.0.tar.gz", hash = "sha256:346b6642e0dbdde3b4ff5a930b664ca82abfa116356ed48cc42c7d6590d36f63"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
test = ["coverage", "mypy", "ruff", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "pyparsing"
|
||||
version = "3.2.3"
|
||||
@@ -1942,6 +2253,61 @@ files = [
|
||||
[package.extras]
|
||||
diagrams = ["jinja2", "railroad-diagrams"]
|
||||
|
||||
[[package]]
|
||||
name = "pyperclip"
|
||||
version = "1.9.0"
|
||||
description = "A cross-platform clipboard module for Python. (Only handles plain text for now.)"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pyperclip-1.9.0.tar.gz", hash = "sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyreadline3"
|
||||
version = "3.5.4"
|
||||
description = "A python implementation of GNU readline."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
|
||||
{file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["build", "flake8", "mypy", "pytest", "twine"]
|
||||
|
||||
[[package]]
|
||||
name = "pyrect"
|
||||
version = "0.2.0"
|
||||
description = "PyRect is a simple module with a Rect class for Pygame-like rectangular areas."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "PyRect-0.2.0.tar.gz", hash = "sha256:f65155f6df9b929b67caffbd57c0947c5ae5449d3b580d178074bffb47a09b78"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyscreeze"
|
||||
version = "1.0.1"
|
||||
description = "A simple, cross-platform screenshot module for Python 2 and 3."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pyscreeze-1.0.1.tar.gz", hash = "sha256:cf1662710f1b46aa5ff229ee23f367da9e20af4a78e6e365bee973cad0ead4be"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Pillow = [
|
||||
{version = ">=9.3.0", markers = "python_version == \"3.11\""},
|
||||
{version = ">=9.2.0", markers = "python_version == \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pysocks"
|
||||
version = "1.7.1"
|
||||
@@ -1973,7 +2339,7 @@ version = "8.3.5"
|
||||
description = "pytest: simple powerful testing with Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
|
||||
{file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
|
||||
@@ -1990,6 +2356,27 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
|
||||
[package.extras]
|
||||
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-html"
|
||||
version = "4.0.2"
|
||||
description = "pytest plugin for generating HTML reports"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytest_html-4.0.2-py3-none-any.whl", hash = "sha256:907c3e68462df129d3ee96dee58bd63f70216b06421836b22fd3fd57ef314acb"},
|
||||
{file = "pytest_html-4.0.2.tar.gz", hash = "sha256:88682b9e8e51392472546a70a2139b27d6bc1834a4afd3e41da33c9d9f91e4a4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
jinja2 = ">=3.0.0"
|
||||
pytest = ">=7.0.0"
|
||||
pytest-metadata = ">=2.0.0"
|
||||
|
||||
[package.extras]
|
||||
docs = ["pip-tools (>=6.13.0)"]
|
||||
test = ["assertpy (>=1.1)", "beautifulsoup4 (>=4.11.1)", "black (>=22.1.0)", "flake8 (>=4.0.1)", "pre-commit (>=2.17.0)", "pytest-mock (>=3.7.0)", "pytest-rerunfailures (>=11.1.2)", "pytest-xdist (>=2.4.0)", "selenium (>=4.3.0)", "tox (>=3.24.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-loguru"
|
||||
version = "0.4.0"
|
||||
@@ -2008,6 +2395,24 @@ loguru = "*"
|
||||
[package.extras]
|
||||
test = ["pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-metadata"
|
||||
version = "3.1.1"
|
||||
description = "pytest plugin for test session metadata"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytest_metadata-3.1.1-py3-none-any.whl", hash = "sha256:c8e0844db684ee1c798cfa38908d20d67d0463ecb6137c72e91f418558dd5f4b"},
|
||||
{file = "pytest_metadata-3.1.1.tar.gz", hash = "sha256:d2a29b0355fbc03f168aa96d41ff88b1a3b44a3b02acbe491801c98a048017c8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytest = ">=7.0.0"
|
||||
|
||||
[package.extras]
|
||||
test = ["black (>=22.1.0)", "flake8 (>=4.0.1)", "pre-commit (>=2.17.0)", "tox (>=3.24.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-mock"
|
||||
version = "3.14.1"
|
||||
@@ -2026,6 +2431,59 @@ pytest = ">=6.2.5"
|
||||
[package.extras]
|
||||
dev = ["pre-commit", "pytest-asyncio", "tox"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-ordering"
|
||||
version = "0.6"
|
||||
description = "pytest plugin to run your tests in a specific order"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"},
|
||||
{file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"},
|
||||
{file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytest = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-rerunfailures"
|
||||
version = "15.1"
|
||||
description = "pytest plugin to re-run tests to eliminate flaky failures"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytest_rerunfailures-15.1-py3-none-any.whl", hash = "sha256:f674c3594845aba8b23c78e99b1ff8068556cc6a8b277f728071fdc4f4b0b355"},
|
||||
{file = "pytest_rerunfailures-15.1.tar.gz", hash = "sha256:c6040368abd7b8138c5b67288be17d6e5611b7368755ce0465dda0362c8ece80"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
packaging = ">=17.1"
|
||||
pytest = ">=7.4,<8.2.2 || >8.2.2"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-xdist"
|
||||
version = "3.7.0"
|
||||
description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytest_xdist-3.7.0-py3-none-any.whl", hash = "sha256:7d3fbd255998265052435eb9daa4e99b62e6fb9cfb6efd1f858d4d8c0c7f0ca0"},
|
||||
{file = "pytest_xdist-3.7.0.tar.gz", hash = "sha256:f9248c99a7c15b7d2f90715df93610353a485827bc06eefb6566d23f6400f126"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
execnet = ">=2.1"
|
||||
pytest = ">=7.0.0"
|
||||
|
||||
[package.extras]
|
||||
psutil = ["psutil (>=3.0)"]
|
||||
setproctitle = ["setproctitle"]
|
||||
testing = ["filelock"]
|
||||
|
||||
[[package]]
|
||||
name = "python-bitcoinlib"
|
||||
version = "0.12.2"
|
||||
@@ -2088,6 +2546,45 @@ Authlib = ">=1.0.0"
|
||||
dataclasses-json = ">=0.5.7"
|
||||
requests = ">=2.28"
|
||||
|
||||
[[package]]
|
||||
name = "python-xlib"
|
||||
version = "0.33"
|
||||
description = "Python X Library"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Linux\""
|
||||
files = [
|
||||
{file = "python-xlib-0.33.tar.gz", hash = "sha256:55af7906a2c75ce6cb280a584776080602444f75815a7aff4d287bb2d7018b32"},
|
||||
{file = "python_xlib-0.33-py2.py3-none-any.whl", hash = "sha256:c3534038d42e0df2f1392a1b30a15a4ff5fdc2b86cfa94f072bf11b10a164398"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
six = ">=1.10.0"
|
||||
|
||||
[[package]]
|
||||
name = "python3-xlib"
|
||||
version = "0.15"
|
||||
description = "Python3 X Library"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Linux\""
|
||||
files = [
|
||||
{file = "python3-xlib-0.15.tar.gz", hash = "sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytweening"
|
||||
version = "1.2.0"
|
||||
description = "A collection of tweening (aka easing) functions."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "pytweening-1.2.0.tar.gz", hash = "sha256:243318b7736698066c5f362ec5c2b6434ecf4297c3c8e7caa8abfe6af4cac71b"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2025.2"
|
||||
@@ -2106,7 +2603,7 @@ version = "6.0.2"
|
||||
description = "YAML parser and emitter for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev", "docs"]
|
||||
groups = ["main", "dev", "docs"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||
@@ -2397,7 +2894,7 @@ description = "Manipulate well-formed Roman numerals"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["docs"]
|
||||
markers = "python_version == \"3.12\""
|
||||
markers = "python_version >= \"3.11\""
|
||||
files = [
|
||||
{file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"},
|
||||
{file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"},
|
||||
@@ -2498,6 +2995,23 @@ files = [
|
||||
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rubicon-objc"
|
||||
version = "0.5.1"
|
||||
description = "A bridge between an Objective C runtime environment and Python."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Darwin\""
|
||||
files = [
|
||||
{file = "rubicon_objc-0.5.1-py3-none-any.whl", hash = "sha256:17092756241b8370231cfaad45ad6e8ce99534987f2acbc944d65df5bdf8f6cd"},
|
||||
{file = "rubicon_objc-0.5.1.tar.gz", hash = "sha256:90bee9fc1de4515e17615e15648989b88bb8d4d2ffc8c7c52748272cd7f30a66"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["pre-commit (==4.2.0)", "pytest (==8.3.5)", "setuptools_scm (==8.3.1)", "tox (==4.26.0)"]
|
||||
docs = ["furo (==2024.8.6)", "pyenchant (==3.2.2)", "sphinx (==8.2.3)", "sphinx-autobuild (==2024.10.3)", "sphinx-copybutton (==0.5.2)", "sphinx_tabs (==3.4.7)", "sphinxcontrib-spelling (==8.0.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.9.10"
|
||||
@@ -2544,6 +3058,22 @@ botocore = ">=1.37.4,<2.0a.0"
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "sbvirtualdisplay"
|
||||
version = "1.4.0"
|
||||
description = "A customized pyvirtualdisplay for SeleniumBase."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sbvirtualdisplay-1.4.0-py3-none-any.whl", hash = "sha256:516de155219aa342c4e090a3c5126cfe6b12416334bcba3255268e44a5e8a206"},
|
||||
{file = "sbvirtualdisplay-1.4.0.tar.gz", hash = "sha256:29a365b509cd7bfde4f758603b7b75703909b11cdf4245abc8f828ed35660d9b"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.6.9) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.0.0) ; python_version >= \"3.9\""]
|
||||
flake8 = ["flake8 (==5.0.4) ; python_version < \"3.9\"", "flake8 (==7.1.1) ; python_version >= \"3.9\"", "mccabe (==0.7.0)", "pycodestyle (==2.12.1) ; python_version >= \"3.9\"", "pycodestyle (==2.9.1) ; python_version < \"3.9\"", "pyflakes (==2.5.0) ; python_version < \"3.9\"", "pyflakes (==3.2.0) ; python_version >= \"3.9\""]
|
||||
|
||||
[[package]]
|
||||
name = "secretstorage"
|
||||
version = "3.3.3"
|
||||
@@ -2580,6 +3110,115 @@ typing_extensions = ">=4.13.2,<4.14.0"
|
||||
urllib3 = {version = ">=2.4.0,<2.5.0", extras = ["socks"]}
|
||||
websocket-client = ">=1.8.0,<1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "seleniumbase"
|
||||
version = "4.39.2"
|
||||
description = "A complete web automation framework for end-to-end testing."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"},
|
||||
{file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
attrs = ">=25.3.0"
|
||||
beautifulsoup4 = "4.13.4"
|
||||
behave = "1.2.6"
|
||||
certifi = ">=2025.4.26"
|
||||
chardet = "5.2.0"
|
||||
charset-normalizer = ">=3.4.2,<4"
|
||||
colorama = ">=0.4.6"
|
||||
cssselect = {version = "1.3.0", markers = "python_version >= \"3.9\""}
|
||||
exceptiongroup = ">=1.3.0"
|
||||
execnet = "2.1.1"
|
||||
fasteners = ">=0.19"
|
||||
filelock = {version = ">=3.18.0", markers = "python_version >= \"3.9\""}
|
||||
h11 = "0.16.0"
|
||||
idna = "3.10"
|
||||
iniconfig = "2.1.0"
|
||||
Jinja2 = ">=3.1.6"
|
||||
markdown-it-py = "3.0.0"
|
||||
MarkupSafe = {version = ">=3.0.2", markers = "python_version >= \"3.9\""}
|
||||
mdurl = "0.1.2"
|
||||
mycdp = ">=1.2.0"
|
||||
outcome = "1.3.0.post0"
|
||||
packaging = ">=25.0"
|
||||
parameterized = "0.9.0"
|
||||
parse = ">=1.20.2"
|
||||
parse-type = ">=0.6.4"
|
||||
pdbp = ">=1.7.0"
|
||||
pip = {version = ">=25.1.1", markers = "python_version >= \"3.9\""}
|
||||
platformdirs = {version = ">=4.3.8", markers = "python_version >= \"3.9\""}
|
||||
pluggy = {version = "1.6.0", markers = "python_version >= \"3.9\""}
|
||||
pygments = ">=2.19.1"
|
||||
pynose = ">=1.5.4"
|
||||
pyotp = "2.9.0"
|
||||
pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""}
|
||||
pytest = "8.3.5"
|
||||
pytest-html = "4.0.2"
|
||||
pytest-metadata = "3.1.1"
|
||||
pytest-ordering = "0.6"
|
||||
pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""}
|
||||
pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""}
|
||||
python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""}
|
||||
pyyaml = ">=6.0.2"
|
||||
requests = "2.32.3"
|
||||
rich = ">=14.0.0,<15"
|
||||
sbvirtualdisplay = ">=1.4.0"
|
||||
selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""}
|
||||
setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""}
|
||||
six = ">=1.17.0"
|
||||
sniffio = "1.3.1"
|
||||
sortedcontainers = "2.4.0"
|
||||
soupsieve = "2.7"
|
||||
tabcompleter = ">=1.4.0"
|
||||
trio = {version = "0.30.0", markers = "python_version >= \"3.9\""}
|
||||
trio-websocket = "0.12.2"
|
||||
typing-extensions = ">=4.13.2"
|
||||
urllib3 = {version = ">=1.26.20,<2.5.0", markers = "python_version >= \"3.10\""}
|
||||
websocket-client = "1.8.0"
|
||||
websockets = {version = ">=15.0.1", markers = "python_version >= \"3.9\""}
|
||||
wheel = ">=0.45.1"
|
||||
wsproto = "1.2.0"
|
||||
|
||||
[package.extras]
|
||||
allure = ["allure-behave (>=2.13.5)", "allure-pytest (>=2.13.5)", "allure-python-commons (>=2.13.5)"]
|
||||
coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.8.2) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.1.1) ; python_version >= \"3.9\""]
|
||||
flake8 = ["flake8 (==5.0.4) ; python_version < \"3.9\"", "flake8 (==7.2.0) ; python_version >= \"3.9\"", "mccabe (==0.7.0)", "pycodestyle (==2.13.0) ; python_version >= \"3.9\"", "pycodestyle (==2.9.1) ; python_version < \"3.9\"", "pyflakes (==2.5.0) ; python_version < \"3.9\"", "pyflakes (==3.3.2) ; python_version >= \"3.9\""]
|
||||
ipdb = ["ipdb (==0.13.13)", "ipython (==7.34.0)"]
|
||||
mss = ["mss (==10.0.0) ; python_version >= \"3.9\"", "mss (==9.0.2) ; python_version < \"3.9\""]
|
||||
pdfminer = ["cffi (==1.17.1)", "cryptography (==39.0.2) ; python_version < \"3.9\"", "cryptography (==45.0.3) ; python_version >= \"3.9\"", "pdfminer.six (==20250324) ; python_version < \"3.9\"", "pdfminer.six (==20250506) ; python_version >= \"3.9\"", "pycparser (==2.22)"]
|
||||
pillow = ["Pillow (>=10.4.0) ; python_version < \"3.9\"", "Pillow (>=11.2.1) ; python_version >= \"3.9\""]
|
||||
pip-system-certs = ["pip-system-certs (==4.0) ; platform_system == \"Windows\""]
|
||||
proxy = ["proxy.py (==2.4.3)"]
|
||||
psutil = ["psutil (==7.0.0)"]
|
||||
pyautogui = ["PyAutoGUI (==0.9.54)"]
|
||||
selenium-stealth = ["selenium-stealth (==1.0.6)"]
|
||||
selenium-wire = ["Brotli (==1.1.0)", "blinker (==1.7.0)", "h2 (==4.1.0)", "hpack (==4.0.0)", "hyperframe (==6.0.1)", "kaitaistruct (==0.10)", "pyOpenSSL (==24.2.1)", "pyasn1 (==0.6.1)", "pyparsing (>=3.1.4)", "selenium-wire (==5.1.0)", "zstandard (==0.23.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "80.9.0"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
|
||||
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
|
||||
core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
|
||||
cover = ["pytest-cov"]
|
||||
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
|
||||
enabler = ["pytest-enabler (>=2.2)"]
|
||||
test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
|
||||
type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
version = "1.17.0"
|
||||
@@ -2647,7 +3286,7 @@ description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["docs"]
|
||||
markers = "python_version < \"3.12\""
|
||||
markers = "python_version == \"3.10\""
|
||||
files = [
|
||||
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
|
||||
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
|
||||
@@ -2684,7 +3323,7 @@ description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.11"
|
||||
groups = ["docs"]
|
||||
markers = "python_version == \"3.12\""
|
||||
markers = "python_version >= \"3.11\""
|
||||
files = [
|
||||
{file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"},
|
||||
{file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"},
|
||||
@@ -2935,6 +3574,21 @@ anyio = ">=3.6.2,<5"
|
||||
[package.extras]
|
||||
full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"]
|
||||
|
||||
[[package]]
|
||||
name = "tabcompleter"
|
||||
version = "1.4.0"
|
||||
description = "tabcompleter --- Autocompletion in the Python console."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "tabcompleter-1.4.0-py3-none-any.whl", hash = "sha256:d744aa735b49c0a6cc2fb8fcd40077fec47425e4388301010b14e6ce3311368b"},
|
||||
{file = "tabcompleter-1.4.0.tar.gz", hash = "sha256:7562a9938e62f8e7c3be612c3ac4e14c5ec4307b58ba9031c148260e866e8814"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyreadline3 = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "telethon"
|
||||
version = "1.40.0"
|
||||
@@ -2972,7 +3626,7 @@ version = "2.2.1"
|
||||
description = "A lil' TOML parser"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev", "docs"]
|
||||
groups = ["main", "dev", "docs"]
|
||||
markers = "python_version == \"3.10\""
|
||||
files = [
|
||||
{file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"},
|
||||
@@ -3418,6 +4072,21 @@ files = [
|
||||
{file = "websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wheel"
|
||||
version = "0.45.1"
|
||||
description = "A built-package format for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248"},
|
||||
{file = "wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
|
||||
|
||||
[[package]]
|
||||
name = "win32-setctime"
|
||||
version = "1.2.0"
|
||||
@@ -3485,4 +4154,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "03a5cc0c06de5cc1227dc36895013562697a481f1e41aa405cc33545c29bbef3"
|
||||
content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350"
|
||||
|
||||
@@ -27,7 +27,6 @@ dependencies = [
|
||||
"bs4 (>=0.0.0)",
|
||||
"loguru (>=0.0.0)",
|
||||
"ffmpeg-python (>=0.0.0)",
|
||||
"selenium (>=0.0.0)",
|
||||
"telethon (>=0.0.0)",
|
||||
"google-api-python-client (>=0.0.0)",
|
||||
"google-auth-httplib2 (>=0.0.0)",
|
||||
@@ -57,6 +56,8 @@ dependencies = [
|
||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||
"secretstorage (>=3.3.3,<4.0.0)",
|
||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||
"pyautogui (>=0.9.54,<0.10.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"name": "Antibot Extractor/Enricher",
|
||||
"type": ["extractor", "enricher"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "seleniumbase"],
|
||||
},
|
||||
"configs": {
|
||||
"save_to_pdf": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "save a PDF snapshot of the page.",
|
||||
},
|
||||
"max_download_images": {
|
||||
"default": 50,
|
||||
"help": "maximum number of images to download from the page (0 = no download, inf = no limit).",
|
||||
},
|
||||
"max_download_videos": {
|
||||
"default": 50,
|
||||
"help": "maximum number of videos to download from the page (0 = no download, inf = no limit).",
|
||||
},
|
||||
"exclude_media_extensions": {
|
||||
"default": ".svg,.ico,.gif",
|
||||
"help": "CSV of media (image/video) file extensions to exclude from download",
|
||||
},
|
||||
"proxy": {
|
||||
"default": None,
|
||||
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile.
|
||||
|
||||
### Features
|
||||
- Extracts the HTML source code of the page.
|
||||
- Takes full-page screenshots of web pages.
|
||||
- Takes full-page PDF snapshots of web pages.
|
||||
- Downloads images and videos from the page, excluding specified file extensions.
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
|
||||
""",
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
import base64
|
||||
import math
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from loguru import logger
|
||||
from seleniumbase import SB
|
||||
|
||||
from auto_archiver.core import Extractor, Enricher, Metadata, Media
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
    """Capture a page through a SeleniumBase-driven browser.

    Usable both as an extractor (``download``) and as an enricher (``enrich``).
    For a page that is not detected as an auth wall, it stores the HTML source,
    a screenshot, optionally a PDF, and the images/videos referenced by the page
    as media on the ``Metadata`` object.

    Configuration attributes read by this class (declared in the module
    manifest): ``save_to_pdf``, ``max_download_images``, ``max_download_videos``,
    ``exclude_media_extensions`` and ``proxy``.
    """

    def setup(self) -> None:
        """Normalise the configuration values into the form used at runtime."""
        # A custom user agent is only used on platforms other than Linux/Windows;
        # on those two the browser's default user agent is kept.
        self.agent = "cool"
        if "linux" in sys.platform or "win32" in sys.platform:
            self.agent = None  # Use the default UserAgent

        # parse configuration options
        self.exclude_media_mimetypes = self._extensions_to_mimetypes(self.exclude_media_extensions)
        self.max_download_images = self._parse_max_downloads(self.max_download_images)
        self.max_download_videos = self._parse_max_downloads(self.max_download_videos)

    @staticmethod
    def _extensions_to_mimetypes(extensions_csv) -> set:
        """Convert a CSV of file extensions (e.g. ".svg,.ico") into a set of mimetypes.

        Surrounding whitespace is ignored and a missing leading dot is added.
        Empty entries and extensions with no known mimetype are dropped.
        """
        excluded = set()
        for extension in (extensions_csv or "").split(","):
            extension = extension.strip()
            if not extension:
                continue
            if not extension.startswith("."):
                extension = f".{extension}"
            mimetype = mimetypes.guess_type(f"file{extension}")[0]
            if mimetype is not None:
                excluded.add(mimetype)
        return excluded

    @staticmethod
    def _parse_max_downloads(value) -> int | float:
        """Parse a download limit: "inf" means no limit, anything else is cast to int.

        An already-parsed ``math.inf`` is passed through unchanged so that calling
        ``setup`` more than once does not fail on ``int(math.inf)``.
        """
        if value == math.inf:
            return math.inf
        if isinstance(value, str) and value.strip().lower() == "inf":
            return math.inf
        return int(value)

    def download(self, item: Metadata) -> Metadata:
        """Extractor entry point: enrich a copy of ``item`` and mark it as archived.

        Returns the new ``Metadata`` with status "antibot" when enrichment
        succeeded; otherwise falls through and returns ``None``.
        """
        # Work on a fresh Metadata object so that ``item`` itself is not modified.
        result = Metadata()
        result.merge(item)
        if self.enrich(result):
            result.status = "antibot"
            return result

    def enrich(self, to_enrich: Metadata) -> bool:
        """Open the URL in a browser and attach title, HTML, screenshot, PDF and media.

        Returns ``True`` when the page was captured, ``False`` when an auth wall
        or CAPTCHA page was detected or when any exception was raised.
        """
        url = to_enrich.get_url()
        # TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher
        url_sample = url[:75]  # shortened URL, only used in log messages
        try:
            with SB(uc=True, agent=self.agent, headed=None, proxy=self.proxy) as sb:
                logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
                sb.uc_open_with_reconnect(url, 4)

                logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")

                # TODO: implement other Captcha handling
                sb.uc_gui_handle_captcha()  # handles Cloudflare Turnstile captcha if detected

                if self._hit_auth_wall(sb):
                    logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
                    return False
                logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")

                to_enrich.set_title(sb.get_title())
                self._enrich_html_source_code(sb, to_enrich)
                self._enrich_full_page_screenshot(sb, to_enrich)
                if self.save_to_pdf:
                    self._enrich_full_page_pdf(sb, to_enrich)

                self._enrich_download_media(sb, to_enrich, css_selector="img", max_media=self.max_download_images)
                self._enrich_download_media(
                    sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos
                )

                logger.success(f"ANTIBOT completed for {url_sample}")

                # Report success as a bool, matching the declared return type.
                return True
        except Exception as e:
            # Top-level boundary: one failing page must not abort the whole run.
            logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
            return False

    def _hit_auth_wall(self, sb: SB) -> bool:
        """
        Tries to detect if the currently loaded page is an auth/login wall.
        Returns True if login is likely required.

        Four heuristics are checked in order: keywords in the current URL,
        visible text markers, keywords in the page title, and visible
        login-style form fields.
        """
        # TODO: improve this detection logic, currently it is very basic and may not cover all cases

        # Common URL patterns
        url = sb.get_current_url().lower()
        if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]):
            return True

        # Common visible text markers
        login_keywords = [
            "sign up or log in",
            "log in to continue",
            "sign in to continue",
            "login required",
            "please log in",
            "please sign up",
            "please sign in",
            "login to access",
            "sign up to access",
            "register to access",
            "captcha verification",
        ]
        # Each phrase is tried in lower case and with its first letter capitalised.
        for word in login_keywords + [w.capitalize() for w in login_keywords]:
            if sb.is_text_visible(word):
                return True

        # Common title markers
        title = sb.get_title().lower()
        if any(
            kw in title
            for kw in [
                "just a moment...",
                "tiktok - make your day",
                "um momento...",
                "log in",
                "sign in",
                "sign up",
                "register",
                "captcha",
                "verification required",
                "access denied",
            ]
        ):
            return True

        # Common form fields
        elements = [
            "input[type='password']",
            "input[type='email']",
            "input[type='username']",
            "input[type='phone']",
            "input[name='username']",
            "input[name='email']",
            "input[name='password']",
            "input[name='login']",
        ]
        if any(sb.is_element_visible(el) for el in elements):
            return True

        return False

    @logger.catch
    def _enrich_html_source_code(self, sb: SB, to_enrich: Metadata):
        """
        Saves the page's HTML source to a file in ``self.tmp_dir`` and adds it
        to the Metadata object as media with id "html_source_code".
        This method is called by the enrich method.
        """
        source = sb.get_page_source()

        html_filename = os.path.join(self.tmp_dir, f"source{random_str(6)}.html")
        with open(html_filename, "w", encoding="utf-8") as f:
            f.write(source)

        to_enrich.add_media(Media(filename=html_filename), id="html_source_code")

    @logger.catch
    def _enrich_full_page_screenshot(self, sb: SB, to_enrich: Metadata):
        """
        Resizes the window to the document's scroll size, saves a PNG screenshot
        in ``self.tmp_dir`` and adds it to the Metadata object with id "screenshot".
        This method is called by the enrich method.
        """
        x = sb.execute_script("return document.documentElement.scrollWidth")
        # The height is capped at 25,000 px.
        y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000)
        sb.set_window_size(x, y)

        screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
        sb.save_screenshot(screen_filename)

        to_enrich.add_media(Media(filename=screen_filename), id="screenshot")

    @logger.catch
    def _enrich_full_page_pdf(self, sb: SB, to_enrich: Metadata):
        """
        Prints the page to PDF through the "Page.printToPDF" CDP command, saves it
        in ``self.tmp_dir`` and adds it to the Metadata object with id "pdf".
        This method is called by the enrich method.
        """
        result = sb.driver.execute_cdp_cmd("Page.printToPDF", {"printBackground": True, "landscape": False})

        # The command result carries the PDF as base64 text under "data".
        pdf_data = base64.b64decode(result["data"])

        pdf_filename = os.path.join(self.tmp_dir, f"pdf{random_str(6)}.pdf")
        with open(pdf_filename, "wb") as f:
            f.write(pdf_data)

        to_enrich.add_media(Media(filename=pdf_filename), id="pdf")

    @logger.catch
    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int | float):
        """
        Downloads media from the page and adds them to the Metadata object.
        This method is called by the enrich method.

        :param css_selector: selector for the elements whose ``src`` is downloaded.
        :param max_media: maximum number of successful downloads; 0 disables
            downloading and ``math.inf`` removes the limit.
        """
        if max_media == 0:
            return
        url = to_enrich.get_url()
        logger.debug(f"Downloading media from {url} with selector '{css_selector}' up to {max_media} items.")
        all_urls = set()  # absolute URLs already downloaded, used to skip duplicates

        for media in sb.find_elements(css_selector):
            if len(all_urls) >= max_media:
                logger.debug(f"Reached max download limit of {max_media} images/videos.")
                break
            src = media.get_attribute("src")
            if not src:
                continue
            mimetype = mimetypes.guess_type(src)[0]
            if mimetype in self.exclude_media_mimetypes:
                continue
            # ``src`` may be relative, so resolve it against the page URL.
            full_src = urljoin(url, src)
            if full_src not in all_urls and (filename := self.download_from_url(full_src)):
                all_urls.add(full_src)
                to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))
|
||||
@@ -62,7 +62,7 @@ If you are having issues with the extractor, you can review the version of `yt-d
|
||||
},
|
||||
"end_means_success": {
|
||||
"default": True,
|
||||
"help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
|
||||
"help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",
|
||||
"type": "bool",
|
||||
},
|
||||
"allow_playlist": {
|
||||
|
||||
@@ -33,6 +33,9 @@ class GenericExtractor(Extractor):
|
||||
def setup(self):
|
||||
self.check_for_extractor_updates()
|
||||
self.setup_po_tokens()
|
||||
# TODO: figure out why the following is not properly recognised by yt-dlp:
|
||||
# if "generic" not in self.extractor_args:
|
||||
# self.extractor_args["generic"] = "impersonate"
|
||||
|
||||
def check_for_extractor_updates(self):
|
||||
"""Checks whether yt-dlp or its plugins need updating and triggers a restart if so."""
|
||||
@@ -590,11 +593,11 @@ class GenericExtractor(Extractor):
|
||||
# Applying user-defined extractor_args
|
||||
if self.extractor_args:
|
||||
for key, args in self.extractor_args.items():
|
||||
logger.debug(f"Setting extractor_args: {key}")
|
||||
if isinstance(args, dict):
|
||||
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
||||
else:
|
||||
arg_str = str(args)
|
||||
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
|
||||
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
||||
|
||||
if self.ytdlp_args:
|
||||
|
||||
@@ -12,6 +12,12 @@
|
||||
font-family: 'Roboto', sans-serif;
|
||||
}
|
||||
|
||||
h2 {
|
||||
white-space: normal;
|
||||
overflow-wrap: break-word;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
table {
|
||||
table-layout: fixed;
|
||||
width: 90%;
|
||||
@@ -97,13 +103,17 @@
|
||||
background-color: #f1f1f1;
|
||||
}
|
||||
|
||||
.pem-certificate, .text-preview {
|
||||
.pem-certificate,
|
||||
.text-preview {
|
||||
text-align: left;
|
||||
font-size: small;
|
||||
}
|
||||
.text-preview{
|
||||
|
||||
.text-preview {
|
||||
padding-left: 10px;
|
||||
padding-right: 10px;
|
||||
max-height: 300px;
|
||||
overflow: auto;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
- Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats).
|
||||
|
||||
### Notes
|
||||
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
||||
- Best used after enrichers like `thumbnail_enricher` or `antibot_extractor_enricher` (takes screenshots) to ensure images are available.
|
||||
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ objects and calculates perceptual hashes using the PDQ hashing algorithm.
|
||||
These hashes are designed specifically for images and can be used
|
||||
for detecting duplicate or near-duplicate visual content.
|
||||
|
||||
This enricher is typically used after thumbnail or screenshot enrichers
|
||||
This enricher is typically used after thumbnail or screenshot (antibot) enrichers
|
||||
to ensure images are available for hashing.
|
||||
|
||||
"""
|
||||
|
||||
@@ -40,6 +40,8 @@ class S3Storage(Storage):
|
||||
try:
|
||||
if media.mimetype:
|
||||
extra_args["ContentType"] = media.mimetype
|
||||
if "text" in media.mimetype:
|
||||
extra_args["ContentType"] += "; charset=utf-8"
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
@@ -1,44 +0,0 @@
|
||||
{
|
||||
"name": "Screenshot Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
|
||||
"height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {
|
||||
"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot",
|
||||
},
|
||||
"http_proxy": {
|
||||
"default": "",
|
||||
"help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
"save_to_pdf": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
|
||||
},
|
||||
"print_options": {
|
||||
"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
### Features
|
||||
- Takes screenshots of web pages, with configurable width, height, and timeout settings.
|
||||
- Optionally saves pages as PDFs, with additional configuration for PDF printing options.
|
||||
- Bypasses URLs detected as authentication walls.
|
||||
- Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
""",
|
||||
}
|
||||
@@ -1,61 +0,0 @@
|
||||
from loguru import logger
|
||||
import time
|
||||
import os
|
||||
import base64
|
||||
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
|
||||
class ScreenshotEnricher(Enricher):
    """Enricher that attaches a screenshot, and optionally a PDF, of the page to the metadata.

    The browser is obtained from ``webdriver_factory``, which is called with the
    configured width, height and timeout plus proxy/print/auth options and must
    return a context manager yielding the driver.
    """

    def __init__(self, webdriver_factory=None):
        """Store the driver factory; defaults to :class:`Webdriver` when none is given."""
        super().__init__()
        self.webdriver_factory = webdriver_factory or Webdriver

    def enrich(self, to_enrich: Metadata) -> None:
        """Take a screenshot (and PDF when ``save_to_pdf`` is set) of the item's URL.

        URLs reported as auth walls are skipped unless cookie-type authentication
        is available for the site. Failures while loading or capturing the page
        are logged and never raised to the caller.
        """
        url = to_enrich.get_url()

        logger.debug(f"Enriching screenshot for {url=}")
        # normalise a missing auth entry to an empty dict so the lookups below are safe
        auth = self.auth_for_site(url) or {}

        # screenshot enricher only supports cookie-type auth (selenium)
        has_valid_auth = bool(auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))

        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
            if any(auth.get(key) for key in ("username", "password", "api_key", "api_secret")):
                # adjacent literals instead of a backslash continuation, which
                # leaked the source indentation into the logged message
                logger.warning(
                    f"Screenshot enricher only supports cookie-type authentication, you have provided {list(auth.keys())} which are not supported. "
                    "Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
                )
            return

        with self.webdriver_factory(
            self.width,
            self.height,
            self.timeout,
            facebook_accept_cookies="facebook.com" in url,
            http_proxy=self.http_proxy,
            print_options=self.print_options,
            auth=auth,
        ) as driver:
            try:
                driver.get(url)
                # give dynamic content time to render before capturing
                time.sleep(int(self.sleep_before_screenshot))
                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
                if self.save_to_pdf:
                    pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
                    # print_page returns the PDF as a base64 string
                    pdf = driver.print_page(driver.print_options)
                    with open(pdf_file, "wb") as f:
                        f.write(base64.b64decode(pdf))
                    to_enrich.add_media(Media(filename=pdf_file), id="pdf")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
# we need to explicitly expose the available imports here
|
||||
from .misc import *
|
||||
from .webdriver import Webdriver
|
||||
|
||||
# handy utils from ytdlp
|
||||
from yt_dlp.utils import clean_html, traverse_obj, strip_or_none, url_or_none
|
||||
|
||||
@@ -1,167 +0,0 @@
|
||||
"""This Webdriver class acts as a context manager for the selenium webdriver."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
|
||||
# import domain_for_url
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from http.cookiejar import MozillaCookieJar
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common import exceptions as selenium_exceptions
|
||||
from selenium.webdriver.common.print_page_options import PrintOptions
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class CookieSettingDriver(webdriver.Firefox):
    """Firefox driver that injects cookies before loading a page and dismisses cookie banners.

    ``cookie`` is a ``"name=value; name2=value2"`` string, ``cookie_jar`` an
    iterable of cookie objects (e.g. a ``MozillaCookieJar``). When both are
    given the explicit ``cookie`` string wins.
    """

    # Button labels tried, in order, when looking for a cookie-consent button.
    _COOKIE_BUTTON_TEXTS = (
        "Refuse non-essential cookies",
        "Decline optional cookies",
        "Reject additional cookies",
        "Reject all",
        "Accept all cookies",
    )

    facebook_accept_cookies: bool
    cookie: str
    cookie_jar: MozillaCookieJar

    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
        """Create the Firefox driver; remaining arguments are forwarded to ``webdriver.Firefox``."""
        if os.environ.get("RUNNING_IN_DOCKER"):
            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
            kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")

        super().__init__(*args, **kwargs)
        self.cookie = cookie
        self.cookie_jar = cookie_jar
        self.facebook_accept_cookies = facebook_accept_cookies

    @staticmethod
    def _button_xpath(text: str) -> str:
        """Return an XPath matching any element whose text contains ``text``, case-insensitively."""
        return f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"

    def _add_cookies_from_string(self, cookie_string: str) -> None:
        """Add every ``name=value`` pair of a ``;``-separated cookie string to the session."""
        for pair in cookie_string.split(";"):
            # split on the first '=' only: cookie values may contain '=' themselves
            name, separator, value = pair.strip().partition("=")
            if not separator or not name:
                continue  # empty or malformed segment
            self.add_cookie({"name": name, "value": value})

    def _add_cookies_from_jar(self, url: str) -> None:
        """Add the cookies of ``self.cookie_jar`` whose domain matches the domain of ``url``."""
        domain = urlparse(url).netloc.removeprefix("www.")
        # escape the domain so its dots are literal and cannot match arbitrary characters
        regex = re.compile(rf"(www)?\.?{re.escape(domain)}$")
        for cookie in self.cookie_jar:
            if not regex.match(cookie.domain):
                continue
            cookie_dict = {
                "name": cookie.name,
                "value": cookie.value,
                "path": cookie.path,
                "domain": cookie.domain,
                "secure": bool(cookie.secure),
            }
            # session cookies have no expiry; presumably a null expiry is
            # rejected by the driver, so only send it when set (TODO confirm)
            if cookie.expires is not None:
                cookie_dict["expiry"] = cookie.expires
            try:
                self.add_cookie(cookie_dict)
            except Exception as e:
                logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")

    def get(self, url: str):
        """Load ``url`` with the configured cookies applied, then try to dismiss cookie/login popups."""
        if self.cookie_jar or self.cookie:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
            super().get(robots_url)

            if self.cookie:
                # an explicit cookie is set for this site, use that first
                self._add_cookies_from_string(self.cookie)
            elif self.cookie_jar:
                self._add_cookies_from_jar(url)

        # now get the actual URL
        super().get(url)
        time.sleep(2)

        # Try and use some common button text to reject/accept cookies
        for text in self._COOKIE_BUTTON_TEXTS:
            try:
                self.find_element(By.XPATH, self._button_xpath(text)).click()
                time.sleep(2)
            except selenium_exceptions.WebDriverException:
                # best effort: a missing or non-clickable button must not abort the page load
                pass

        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
            try:
                xpath = "//div[@role='dialog']//div[@aria-label='Close']"
                self.find_element(By.XPATH, xpath).click()
                time.sleep(2)
            except selenium_exceptions.NoSuchElementException:
                logger.warning("Unable to find the 'close' button on the facebook login window")
        else:
            # for all other sites, wait for a consent button to become clickable and stop at the first hit
            for text in self._COOKIE_BUTTON_TEXTS:
                try:
                    WebDriverWait(self, 5).until(
                        EC.element_to_be_clickable((By.XPATH, self._button_xpath(text)))
                    ).click()
                    break
                except selenium_exceptions.WebDriverException:
                    pass
|
||||
|
||||
|
||||
class Webdriver:
    """Context manager that creates a headless Firefox ``CookieSettingDriver`` and tears it down on exit."""

    def __init__(
        self,
        width: int,
        height: int,
        timeout_seconds: int,
        facebook_accept_cookies: bool = False,
        http_proxy: str = "",
        print_options: dict | None = None,
        auth: dict | None = None,
    ) -> None:
        """Store the browser settings.

        Args:
            width: window width passed to ``set_window_size``.
            height: window height passed to ``set_window_size``.
            timeout_seconds: page-load timeout for the driver.
            facebook_accept_cookies: enable the Facebook-specific popup handling.
            http_proxy: proxy URL; ignored when empty.
            print_options: attribute name -> value pairs applied to a ``PrintOptions`` object.
            auth: authentication dict; only ``cookie`` and ``cookies_jar`` are read.
        """
        self.width = width
        self.height = height
        self.timeout_seconds = timeout_seconds
        # None defaults avoid sharing one mutable dict between instances
        self.auth = auth or {}
        self.facebook_accept_cookies = facebook_accept_cookies
        self.http_proxy = http_proxy
        # create and set print options
        self.print_options = PrintOptions()
        for k, v in (print_options or {}).items():
            setattr(self.print_options, k, v)

    def __enter__(self) -> CookieSettingDriver:
        """Start the browser and return the configured driver.

        Raises:
            selenium TimeoutException: when the driver itself could not be created.
        """
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        if self.http_proxy:
            # NOTE(review): '--proxy-server' looks like a Chromium-style switch;
            # confirm Firefox honours it or configure the proxy through options/preferences.
            options.add_argument(f"--proxy-server={self.http_proxy}")
        options.set_preference("network.protocol-handler.external.tg", False)
        # if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
        if self.facebook_accept_cookies:
            options.add_argument("--lang=en")

        driver = None
        try:
            driver = CookieSettingDriver(
                cookie=self.auth.get("cookie"),
                cookie_jar=self.auth.get("cookies_jar"),
                facebook_accept_cookies=self.facebook_accept_cookies,
                options=options,
            )
            self.driver = driver
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
            self.driver.print_options = self.print_options
        except selenium_exceptions.TimeoutException as e:
            logger.error(
                f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}"
            )
            if driver is None:
                # no driver exists: surface the real error instead of an AttributeError below
                raise

        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close and quit the browser; suppress ordinary exceptions raised inside the ``with`` body."""
        try:
            self.driver.close()
        finally:
            # always quit, even if close() failed, so no browser process is left behind
            try:
                self.driver.quit()
            finally:
                del self.driver
        # keep swallowing regular errors as before, but let KeyboardInterrupt/SystemExit through
        return exc_type is not None and issubclass(exc_type, Exception)
|
||||
@@ -1,216 +0,0 @@
|
||||
import base64
|
||||
|
||||
import pytest
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
|
||||
|
||||
|
||||
@pytest.fixture
def mock_selenium_env(mocker):
    """Stub out Selenium and driver discovery so that no real browser is started.

    Yields a ``(driver, driver_class, firefox_options)`` tuple of mocks.
    """
    # only geckodriver is reported as an installed binary
    mocker.patch(
        "shutil.which",
        side_effect=lambda dep: "/mock/geckodriver" if dep == "geckodriver" else None,
    )
    driver_cls = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
    mocker.patch(
        "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths",
        return_value={
            "driver_path": "/mock/driver",
            "browser_path": "/mock/browser",
        },
    )
    mocker.patch("pathlib.Path.is_file", return_value=True)

    # fake driver process whose poll() returns None
    fake_process = mocker.MagicMock()
    fake_process.poll.return_value = None
    mocker.patch("subprocess.Popen", return_value=fake_process)

    mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
    options_cls = mocker.patch("selenium.webdriver.FirefoxOptions")

    # instances handed out by the patched classes
    driver = mocker.MagicMock()
    driver_cls.return_value = driver
    options = mocker.MagicMock()
    options_cls.return_value = options

    yield driver, driver_cls, options
|
||||
|
||||
|
||||
@pytest.fixture
def common_patches(tmp_path, mocker):
    """Disable sleeping and the auth-wall check, and pin ``os.path.join`` to a file in ``tmp_path``."""
    fixed_path = str(tmp_path / "test.png")
    mocker.patch("time.sleep")
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=False)
    # patched last so resolving the other patch targets still uses the real join
    mocker.patch("os.path.join", return_value=fixed_path)
    yield
|
||||
|
||||
|
||||
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
    """Return a ScreenshotEnricher built through ``setup_module`` with PDF export disabled."""
    configs: dict = {
        "width": 1280,
        "height": 720,
        "timeout": 60,
        "sleep_before_screenshot": 4,
        "http_proxy": "",
        # a real boolean: the string "False" is truthy and would enable the PDF branch
        "save_to_pdf": False,
        "print_options": {},
    }
    return setup_module("screenshot_enricher", configs)
|
||||
|
||||
|
||||
@pytest.fixture
def metadata_with_video():
    """Return a Metadata item for https://example.com that already holds one video media."""
    video = Media(filename="video.mp4").set("id", "video1")
    item = Metadata()
    item.set_url("https://example.com")
    item.add_media(video)
    return item
|
||||
|
||||
|
||||
def test_enrich_adds_screenshot(
    screenshot_enricher,
    metadata_with_video,
    mock_selenium_env,
    common_patches,
    tmp_path,
):
    """Enriching a plain URL creates the driver once and appends a screenshot media."""
    driver, driver_cls, options = mock_selenium_env
    expected_file = str(tmp_path / "test.png")

    screenshot_enricher.enrich(metadata_with_video)

    # the driver is built without cookies and with the mocked Firefox options
    driver_cls.assert_called_once_with(
        cookie=None,
        cookie_jar=None,
        facebook_accept_cookies=False,
        options=options,
    )
    # the page is loaded and captured exactly once
    driver.get.assert_called_once_with("https://example.com")
    driver.save_screenshot.assert_called_once_with(expected_file)

    # original video + new screenshot
    media = metadata_with_video.media
    assert len(media) == 2
    assert media[1].properties.get("id") == "screenshot"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url,is_auth",
    [
        ("https://example.com", False),
        ("https://private.com", True),
    ],
)
def test_enrich_auth_wall(
    screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
):
    """A URL behind an auth wall is skipped; any other URL gets a screenshot."""
    driver = mock_selenium_env[0]
    # override the default from common_patches for this parametrisation
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth)
    metadata_with_video.set_url(url)

    screenshot_enricher.enrich(metadata_with_video)

    if not is_auth:
        driver.get.assert_called_once_with(url)
        assert len(metadata_with_video.media) == 2
        assert metadata_with_video.media[1].properties.get("id") == "screenshot"
        return

    # auth wall: the page is never loaded and only the original video remains
    driver.get.assert_not_called()
    assert len(metadata_with_video.media) == 1
    assert metadata_with_video.media[0].properties.get("id") == "video1"
|
||||
|
||||
|
||||
def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
    """Without cookies, an auth-walled site is skipped and the skip is logged as a warning."""
    item = Metadata().set_url("https://instagram.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "[SKIP] SCREENSHOT since url" in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "auth",
    [
        {"cookie": "cookie"},
        {"cookies_jar": "cookie"},
    ],
)
def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
    """With cookie-type auth configured, an auth-walled URL is not skipped."""
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    # replace the authentication dict with the parametrised cookie auth
    screenshot_enricher.authentication = {"example.com": auth}
    item = Metadata().set_url("https://example.com")

    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "[SKIP] SCREENSHOT since url" not in caplog.text
|
||||
|
||||
|
||||
def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
    """Username/password auth on an auth-walled URL triggers the cookie-only warning."""
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    credentials = {"username": "user", "password": "pass"}
    screenshot_enricher.authentication = {"example.com": credentials}
    item = Metadata().set_url("https://example.com")

    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
|
||||
|
||||
|
||||
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    """A page-load timeout is logged at info level and no media is added."""
    driver = mock_selenium_env[0]
    driver.get.side_effect = TimeoutException
    info_log = mocker.patch("loguru.logger.info")

    screenshot_enricher.enrich(metadata_with_video)

    info_log.assert_called_once_with("TimeoutException loading page for screenshot")
    # only the original video remains
    assert len(metadata_with_video.media) == 1
|
||||
|
||||
|
||||
def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    """An unexpected error while taking the screenshot is logged and adds no media."""
    driver = mock_selenium_env[0]
    # the page loads fine, but saving the screenshot blows up
    driver.get.return_value = None
    driver.save_screenshot.side_effect = Exception("Unexpected Error")
    error_log = mocker.patch("loguru.logger.error")

    screenshot_enricher.enrich(metadata_with_video)

    # the failure is reported through the error log ...
    error_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
    # ... and the media list is left untouched
    assert len(metadata_with_video.media) == 1
|
||||
|
||||
|
||||
def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env):
    """With save_to_pdf enabled, both a screenshot and a PDF are added as media."""
    driver = mock_selenium_env[0]
    screenshot_enricher.save_to_pdf = True
    # print_page hands back the PDF as base64 text
    encoded_pdf = base64.b64encode(b"fake_pdf_content").decode("utf-8")
    driver.print_page.return_value = encoded_pdf

    # make generated file names predictable: join keeps only the last part, random suffix is fixed
    mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
    mocker.patch(
        "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
        return_value="fixed123",
    )
    opened = mocker.patch("builtins.open", new_callable=mocker.mock_open)

    screenshot_enricher.enrich(metadata_with_video)

    # screenshot taken and page printed with the driver's print options
    driver.save_screenshot.assert_called_once()
    driver.print_page.assert_called_once_with(driver.print_options)
    # the PDF was written to the expected file in binary mode
    opened.assert_any_call("pdf_fixed123.pdf", "wb")

    # original video + screenshot + pdf
    media = metadata_with_video.media
    assert len(media) == 3
    assert media[1].properties.get("id") == "screenshot"
    assert media[2].properties.get("id") == "pdf"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
    """After each test, delete every entry left directly inside ``tmp_path``."""
    yield
    leftovers = list(tmp_path.iterdir())
    for leftover in leftovers:
        leftover.unlink()
|
||||
173
tests/extractors/test_antibot_extractor_enricher.py
Normal file
173
tests/extractors/test_antibot_extractor_enricher.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
class DummySB:
    """Minimal stand-in for a SeleniumBase session, answering from canned values."""

    def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
        """Store the canned URL and title plus the texts/selectors reported as visible."""
        self._current_url = url
        self._page_title = title
        # falsy collections (None, empty) are replaced by a fresh empty set
        self._texts = visible_texts if visible_texts else set()
        self._elements = visible_elements if visible_elements else set()

    def get_current_url(self):
        """Return the canned URL."""
        return self._current_url

    def get_title(self):
        """Return the canned page title."""
        return self._page_title

    def is_text_visible(self, text):
        """Return True when ``text`` is one of the configured visible texts."""
        found = text in self._texts
        return found

    def is_element_visible(self, selector):
        """Return True when ``selector`` is one of the configured visible elements."""
        found = selector in self._elements
        return found
|
||||
|
||||
|
||||
class TestAntibotExtractorEnricher(TestExtractorBase):
    """Tests Antibot Extractor/Enricher"""

    extractor_module = "antibot_extractor_enricher"
    extractor: AntibotExtractorEnricher

    # default module configuration: no PDF and no media downloads
    config = {
        "save_to_pdf": False,
        "max_download_images": 0,
        "max_download_videos": 0,
        "exclude_media_extensions": ".svg,.ico,.gif",
        "proxy": None,
    }

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_title,image_count,video_count",
        [
            (
                "https://en.wikipedia.org/wiki/Western_barn_owl",
                "western barn owl",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
                "open sources show myanmar",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
                "shot from above",
                5,
                1,
            ),
            (
                "https://www.bellingcat.com/about/general-information",
                "general information",
                0,  # SVGs are ignored
                0,
            ),
        ],
    )
    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count):
        """
        Test downloading pages with media.
        """

        self.extractor = setup_module(
            self.extractor_module,
            {
                "save_to_pdf": True,
                "max_download_images": 5,
                "max_download_videos": "inf",
            },
        )

        item = make_item(url)
        result = self.extractor.download(item)

        assert result.status == "antibot", "Expected status to be 'antibot'"

        # Check the title contains the expected phrase (case-insensitive)
        page_title = result.get_title() or ""
        assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"

        # the screenshot is an image too, but is not one of the downloaded page images
        image_media = [m for m in result.media if m.is_image() and m.get("id") != "screenshot"]
        assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"
        video_media = [m for m in result.media if m.is_video()]
        assert len(video_media) == video_count, f"Expected {video_count} video items, got {len(video_media)}"

        for expected_id in ["screenshot", "pdf", "html_source_code"]:
            assert any(m.get("id") == expected_id for m in result.media), (
                f"Expected media with id '{expected_id}' not found"
            )

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_html",
        [
            (
                "https://myrotvorets.center/about/",
                "Центр «Миротворець»",
            ),
            (
                "https://seleniumbase.io/apps/turnstile",
                'id="captcha-success"',
            ),
        ],
    )
    def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
        """
        Test downloading a page with Cloudflare Turnstile captcha.
        """

        item = make_item(url)
        self.extractor.enrich(item)

        assert item.status != "antibot", "Expected status not to be 'antibot' after handling Cloudflare Turnstile"

        # the saved page source must contain the marker text
        html_media = item.get_media_by_id("html_source_code")
        with open(html_media.filename, "r", encoding="utf-8") as f:
            html_content = f.read()
        assert in_html.lower() in html_content.lower(), f"Expected HTML to contain '{in_html}'"

    @pytest.mark.parametrize(
        "url,title,visible_texts,visible_elements,expected",
        [
            # One trigger of each kind: URL, title, visible text, visible element
            ("https://example.com/login", "Welcome", set(), set(), True),
            ("https://example.com/somepage", "Just a moment...", set(), set(), True),
            ("https://example.com/", "Welcome", {"Please log in"}, set(), True),
            ("https://example.com/", "Welcome", set(), {"input[type='password']"}, True),
            # Visible text that matches no trigger (a one-string set, not a set of characters)
            ("https://example.com/", "Welcome", {"No issue here"}, set(), False),
            # Title triggers
            ("https://example.com/", "Log in", set(), set(), True),
            ("https://example.com/", "Verification required", set(), set(), True),
            # Text triggers (case-insensitive)
            ("https://example.com/", "Welcome", {"Sign up or log in"}, set(), True),
            ("https://example.com/", "Welcome", {"sign up or log in"}, set(), True),
            # Element triggers
            ("https://example.com/", "Welcome", set(), {"input[name='email']"}, True),
            # No triggers
            ("https://example.com/", "Welcome", set(), set(), False),
        ],
    )
    def test_hit_auth_wall(self, url, title, visible_texts, visible_elements, expected):
        """Check ``_hit_auth_wall`` against a fake browser session with canned page state."""
        extractor = AntibotExtractorEnricher()
        sb = DummySB(url=url, title=title, visible_texts=visible_texts, visible_elements=visible_elements)
        assert extractor._hit_auth_wall(sb) == expected

    def test_enrich_handles_sb_exception(self, make_item, mocker):
        """
        Test that enrich returns False and logs error if SB raises an exception.
        """

        # Patch SB so that calling it raises
        mock_sb = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.SB")
        mock_logger = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.logger")
        mock_sb.side_effect = Exception("SB failed")

        item = make_item("https://example.com/")
        result = self.extractor.enrich(item)

        assert result is False
        mock_logger.error.assert_called()
|
||||
@@ -97,7 +97,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
)
|
||||
def test_download_nonexistent_media(self, make_item, url):
|
||||
"""
|
||||
Test to make sure that the extractor doesn't break on non-existend posts/media
|
||||
Test to make sure that the extractor doesn't break on non-existent posts/media
|
||||
|
||||
It should return 'False'
|
||||
"""
|
||||
|
||||
@@ -45,6 +45,19 @@ class TestS3Storage:
|
||||
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
|
||||
|
||||
def test_uploadf_sets_acl_public(self, mocker):
    """uploadf forwards the file to S3 with a public-read ACL and the PNG content type."""
    media = Media("test.png")
    file_obj = mocker.MagicMock()
    upload_spy = mocker.patch.object(self.storage.s3, "upload_fileobj")
    # force the upload path regardless of the needs-upload check
    mocker.patch.object(self.storage, "is_upload_needed", return_value=True)

    self.storage.uploadf(file_obj, media)

    expected_extra_args = {"ACL": "public-read", "ContentType": "image/png"}
    upload_spy.assert_called_once_with(
        file_obj,
        Bucket="test-bucket",
        Key=media.key,
        ExtraArgs=expected_extra_args,
    )
|
||||
|
||||
def test_uploadf_detects_charset_for_text_files(self, mocker):
|
||||
media = Media("test.txt")
|
||||
mock_file = mocker.MagicMock()
|
||||
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
@@ -54,7 +67,7 @@ class TestS3Storage:
|
||||
mock_file,
|
||||
Bucket="test-bucket",
|
||||
Key=media.key,
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain; charset=utf-8"},
|
||||
)
|
||||
|
||||
def test_upload_decision_logic(self, mocker):
|
||||
|
||||
Reference in New Issue
Block a user