mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-07-01 22:28:37 +03:00
Merge branch 'main' into webdriver-cookies
This commit is contained in:
24
.github/workflows/ruff.yaml
vendored
Normal file
24
.github/workflows/ruff.yaml
vendored
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
name: Ruff Formatting & Linting
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ main ]
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Install Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.11"
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install ruff
|
||||||
|
|
||||||
|
- name: Run Ruff
|
||||||
|
run: ruff check --output-format=github . && ruff format --check
|
||||||
10
.pre-commit-config.yaml
Normal file
10
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Run Ruff formatter on commits.
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
|
rev: v0.9.10
|
||||||
|
hooks:
|
||||||
|
- id: ruff-format
|
||||||
|
|
||||||
|
# Runs Ruff linting - just checks without fixing, but blocks commit if errors are found.
|
||||||
|
# - id: ruff
|
||||||
|
# args: ["--output-format=concise"]
|
||||||
79
Makefile
Normal file
79
Makefile
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# Variables
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = docs/source
|
||||||
|
BUILDDIR = docs/_build
|
||||||
|
|
||||||
|
.PHONY: help
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
@echo "Additional Commands:"
|
||||||
|
@echo " make test - Run all tests in 'tests/' with pytest"
|
||||||
|
@echo " make ruff-check - Run Ruff linting and formatting checks (safe)"
|
||||||
|
@echo " make ruff-clean - Auto-fix Ruff linting and formatting issues"
|
||||||
|
@echo " make docs - Generate documentation (same as 'make html')"
|
||||||
|
@echo " make clean-docs - Remove generated docs"
|
||||||
|
@echo " make docker-build - Build the Auto Archiver Docker image"
|
||||||
|
@echo " make docker-compose - Run Auto Archiver with Docker Compose"
|
||||||
|
@echo " make docker-compose-rebuild - Rebuild and run Auto Archiver with Docker Compose"
|
||||||
|
@echo " make show-docs - Build and open the documentation in a browser"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: test
|
||||||
|
test:
|
||||||
|
@echo "Running tests..."
|
||||||
|
@pytest tests --disable-warnings
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: ruff-check
|
||||||
|
ruff-check:
|
||||||
|
@echo "Checking code style with Ruff (safe)..."
|
||||||
|
@ruff check .
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: ruff-clean
|
||||||
|
ruff-clean:
|
||||||
|
@echo "Fixing lint and formatting issues with Ruff..."
|
||||||
|
@ruff check . --fix
|
||||||
|
@ruff format .
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: docs
|
||||||
|
docs:
|
||||||
|
@echo "Building documentation..."
|
||||||
|
@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)"
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: clean-docs
|
||||||
|
clean-docs:
|
||||||
|
@echo "Cleaning up generated documentation files..."
|
||||||
|
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
@rm -rf "$(SOURCEDIR)/autoapi/" "$(SOURCEDIR)/modules/autogen/"
|
||||||
|
@echo "Cleanup complete."
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: show-docs
|
||||||
|
show-docs:
|
||||||
|
@echo "Opening documentation in browser..."
|
||||||
|
@open "$(BUILDDIR)/html/index.html"
|
||||||
|
|
||||||
|
.PHONY: docker-build
|
||||||
|
docker-build:
|
||||||
|
@echo "Building local Auto Archiver Docker image..."
|
||||||
|
@docker compose build # Uses the same build context as docker-compose.yml
|
||||||
|
|
||||||
|
.PHONY: docker-compose
|
||||||
|
docker-compose:
|
||||||
|
@echo "Running Auto Archiver with Docker Compose..."
|
||||||
|
@docker compose up
|
||||||
|
|
||||||
|
.PHONY: docker-compose-rebuild
|
||||||
|
docker-compose-rebuild:
|
||||||
|
@echo "Rebuilding and running Auto Archiver with Docker Compose..."
|
||||||
|
@docker compose up --build
|
||||||
|
|
||||||
|
# Catch-all for Sphinx commands
|
||||||
|
.PHONY: Makefile
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
# Minimal makefile for Sphinx documentation
|
|
||||||
#
|
|
||||||
|
|
||||||
# You can set these variables from the command line, and also
|
|
||||||
# from the environment for the first two.
|
|
||||||
SPHINXOPTS ?=
|
|
||||||
SPHINXBUILD ?= sphinx-build
|
|
||||||
SOURCEDIR = source
|
|
||||||
BUILDDIR = _build
|
|
||||||
|
|
||||||
# Put it first so that "make" without argument is like "make help".
|
|
||||||
help:
|
|
||||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
||||||
|
|
||||||
.PHONY: help Makefile
|
|
||||||
|
|
||||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
|
||||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
|
||||||
%: Makefile
|
|
||||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
||||||
@@ -10,12 +10,12 @@ MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_arch
|
|||||||
SAVE_FOLDER = Path(__file__).parent.parent / "source" / "modules" / "autogen"
|
SAVE_FOLDER = Path(__file__).parent.parent / "source" / "modules" / "autogen"
|
||||||
|
|
||||||
type_color = {
|
type_color = {
|
||||||
'feeder': "<span style='color: #FFA500'>[feeder](/core_modules.md#feeder-modules)</a></span>",
|
"feeder": "<span style='color: #FFA500'>[feeder](/core_modules.md#feeder-modules)</a></span>",
|
||||||
'extractor': "<span style='color: #00FF00'>[extractor](/core_modules.md#extractor-modules)</a></span>",
|
"extractor": "<span style='color: #00FF00'>[extractor](/core_modules.md#extractor-modules)</a></span>",
|
||||||
'enricher': "<span style='color: #0000FF'>[enricher](/core_modules.md#enricher-modules)</a></span>",
|
"enricher": "<span style='color: #0000FF'>[enricher](/core_modules.md#enricher-modules)</a></span>",
|
||||||
'database': "<span style='color: #FF00FF'>[database](/core_modules.md#database-modules)</a></span>",
|
"database": "<span style='color: #FF00FF'>[database](/core_modules.md#database-modules)</a></span>",
|
||||||
'storage': "<span style='color: #FFFF00'>[storage](/core_modules.md#storage-modules)</a></span>",
|
"storage": "<span style='color: #FFFF00'>[storage](/core_modules.md#storage-modules)</a></span>",
|
||||||
'formatter': "<span style='color: #00FFFF'>[formatter](/core_modules.md#formatter-modules)</a></span>",
|
"formatter": "<span style='color: #00FFFF'>[formatter](/core_modules.md#formatter-modules)</a></span>",
|
||||||
}
|
}
|
||||||
|
|
||||||
TABLE_HEADER = ("Option", "Description", "Default", "Type")
|
TABLE_HEADER = ("Option", "Description", "Default", "Type")
|
||||||
@@ -34,6 +34,7 @@ steps:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def generate_module_docs():
|
def generate_module_docs():
|
||||||
yaml = YAML()
|
yaml = YAML()
|
||||||
SAVE_FOLDER.mkdir(exist_ok=True)
|
SAVE_FOLDER.mkdir(exist_ok=True)
|
||||||
@@ -48,49 +49,49 @@ def generate_module_docs():
|
|||||||
# generate the markdown file from the __manifest__.py file.
|
# generate the markdown file from the __manifest__.py file.
|
||||||
|
|
||||||
manifest = module.manifest
|
manifest = module.manifest
|
||||||
for type in manifest['type']:
|
for type in manifest["type"]:
|
||||||
modules_by_type.setdefault(type, []).append(module)
|
modules_by_type.setdefault(type, []).append(module)
|
||||||
|
|
||||||
description = "\n".join(l.lstrip() for l in manifest['description'].split("\n"))
|
description = "\n".join(line.lstrip() for line in manifest["description"].split("\n"))
|
||||||
types = ", ".join(type_color[t] for t in manifest['type'])
|
types = ", ".join(type_color[t] for t in manifest["type"])
|
||||||
readme_str = f"""
|
readme_str = f"""
|
||||||
# {manifest['name']}
|
# {manifest["name"]}
|
||||||
```{{admonition}} Module type
|
```{{admonition}} Module type
|
||||||
|
|
||||||
{types}
|
{types}
|
||||||
```
|
```
|
||||||
{description}
|
{description}
|
||||||
"""
|
"""
|
||||||
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
|
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"])
|
||||||
|
|
||||||
if not manifest['configs']:
|
if not manifest["configs"]:
|
||||||
config_string = f"# No configuration options for {module.name}.*\n"
|
config_string = f"# No configuration options for {module.name}.*\n"
|
||||||
else:
|
else:
|
||||||
|
|
||||||
config_table = header_row
|
config_table = header_row
|
||||||
config_yaml = {}
|
config_yaml = {}
|
||||||
|
|
||||||
global_yaml[module.name] = CommentedMap()
|
global_yaml[module.name] = CommentedMap()
|
||||||
global_yaml.yaml_set_comment_before_after_key(module.name, f"\n\n{module.display_name} configuration options")
|
global_yaml.yaml_set_comment_before_after_key(
|
||||||
|
module.name, f"\n\n{module.display_name} configuration options"
|
||||||
|
)
|
||||||
|
|
||||||
|
for key, value in manifest["configs"].items():
|
||||||
for key, value in manifest['configs'].items():
|
type = value.get("type", "string")
|
||||||
type = value.get('type', 'string')
|
if type == "json_loader":
|
||||||
if type == 'json_loader':
|
value["type"] = "json"
|
||||||
value['type'] = 'json'
|
elif type == "str":
|
||||||
elif type == 'str':
|
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|
||||||
default = value.get('default', '')
|
default = value.get("default", "")
|
||||||
config_yaml[key] = default
|
config_yaml[key] = default
|
||||||
|
|
||||||
global_yaml[module.name][key] = default
|
global_yaml[module.name][key] = default
|
||||||
|
|
||||||
if value.get('help', ''):
|
if value.get("help", ""):
|
||||||
global_yaml[module.name].yaml_add_eol_comment(value.get('help', ''), key)
|
global_yaml[module.name].yaml_add_eol_comment(value.get("help", ""), key)
|
||||||
|
|
||||||
help = "**Required**. " if value.get('required', False) else "Optional. "
|
help = "**Required**. " if value.get("required", False) else "Optional. "
|
||||||
help += value.get('help', '')
|
help += value.get("help", "")
|
||||||
config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"
|
config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"
|
||||||
global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
|
global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
|
||||||
readme_str += "\n## Configuration Options\n"
|
readme_str += "\n## Configuration Options\n"
|
||||||
@@ -98,18 +99,18 @@ def generate_module_docs():
|
|||||||
|
|
||||||
config_string = io.BytesIO()
|
config_string = io.BytesIO()
|
||||||
yaml.dump({module.name: config_yaml}, config_string)
|
yaml.dump({module.name: config_yaml}, config_string)
|
||||||
config_string = config_string.getvalue().decode('utf-8')
|
config_string = config_string.getvalue().decode("utf-8")
|
||||||
yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
|
yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
|
||||||
readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
|
readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
|
||||||
|
|
||||||
if manifest['configs']:
|
if manifest["configs"]:
|
||||||
readme_str += "\n### Command Line:\n"
|
readme_str += "\n### Command Line:\n"
|
||||||
readme_str += config_table
|
readme_str += config_table
|
||||||
|
|
||||||
# add a link to the autodoc refs
|
# add a link to the autodoc refs
|
||||||
readme_str += f"\n[API Reference](../../../autoapi/{module.name}/index)\n"
|
readme_str += f"\n[API Reference](../../../autoapi/{module.name}/index)\n"
|
||||||
# create the module.type folder, use the first type just for where to store the file
|
# create the module.type folder, use the first type just for where to store the file
|
||||||
for type in manifest['type']:
|
for type in manifest["type"]:
|
||||||
type_folder = SAVE_FOLDER / type
|
type_folder = SAVE_FOLDER / type
|
||||||
type_folder.mkdir(exist_ok=True)
|
type_folder.mkdir(exist_ok=True)
|
||||||
with open(type_folder / f"{module.name}.md", "w") as f:
|
with open(type_folder / f"{module.name}.md", "w") as f:
|
||||||
@@ -117,10 +118,10 @@ def generate_module_docs():
|
|||||||
f.write(readme_str)
|
f.write(readme_str)
|
||||||
generate_index(modules_by_type)
|
generate_index(modules_by_type)
|
||||||
|
|
||||||
del global_yaml['placeholder']
|
del global_yaml["placeholder"]
|
||||||
global_string = io.BytesIO()
|
global_string = io.BytesIO()
|
||||||
global_yaml = yaml.dump(global_yaml, global_string)
|
global_yaml = yaml.dump(global_yaml, global_string)
|
||||||
global_string = global_string.getvalue().decode('utf-8')
|
global_string = global_string.getvalue().decode("utf-8")
|
||||||
global_yaml = f"```yaml\n{global_string}\n```"
|
global_yaml = f"```yaml\n{global_string}\n```"
|
||||||
with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f:
|
with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f:
|
||||||
f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table)
|
f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import os
|
|||||||
from importlib.metadata import metadata
|
from importlib.metadata import metadata
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
sys.path.append(os.path.abspath('../scripts'))
|
sys.path.append(os.path.abspath("../scripts"))
|
||||||
from scripts import generate_module_docs
|
from scripts import generate_module_docs
|
||||||
from auto_archiver.version import __version__
|
from auto_archiver.version import __version__
|
||||||
|
|
||||||
@@ -20,33 +20,35 @@ project = package_metadata["name"]
|
|||||||
copyright = str(datetime.now().year)
|
copyright = str(datetime.now().year)
|
||||||
author = "Bellingcat"
|
author = "Bellingcat"
|
||||||
release = package_metadata["version"]
|
release = package_metadata["version"]
|
||||||
language = 'en'
|
language = "en"
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
extensions = [
|
extensions = [
|
||||||
"myst_parser", # Markdown support
|
"myst_parser", # Markdown support
|
||||||
"autoapi.extension", # Generate API documentation from docstrings
|
"autoapi.extension", # Generate API documentation from docstrings
|
||||||
"sphinxcontrib.mermaid", # Mermaid diagrams
|
"sphinxcontrib.mermaid", # Mermaid diagrams
|
||||||
"sphinx.ext.viewcode", # Source code links
|
"sphinx.ext.viewcode", # Source code links
|
||||||
"sphinx_copybutton",
|
"sphinx_copybutton",
|
||||||
"sphinx.ext.napoleon", # Google-style and NumPy-style docstrings
|
"sphinx.ext.napoleon", # Google-style and NumPy-style docstrings
|
||||||
"sphinx.ext.autosectionlabel",
|
"sphinx.ext.autosectionlabel",
|
||||||
# 'sphinx.ext.autosummary', # Summarize module/class/function docs
|
# 'sphinx.ext.autosummary', # Summarize module/class/function docs
|
||||||
]
|
]
|
||||||
|
|
||||||
templates_path = ['_templates']
|
templates_path = ["_templates"]
|
||||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""]
|
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""]
|
||||||
|
|
||||||
|
|
||||||
# -- AutoAPI Configuration ---------------------------------------------------
|
# -- AutoAPI Configuration ---------------------------------------------------
|
||||||
autoapi_type = 'python'
|
autoapi_type = "python"
|
||||||
autoapi_dirs = ["../../src/auto_archiver/core/", "../../src/auto_archiver/utils/"]
|
autoapi_dirs = ["../../src/auto_archiver/core/", "../../src/auto_archiver/utils/"]
|
||||||
# get all the modules and add them to the autoapi_dirs
|
# get all the modules and add them to the autoapi_dirs
|
||||||
autoapi_dirs.extend([f"../../src/auto_archiver/modules/{m}" for m in os.listdir("../../src/auto_archiver/modules")])
|
autoapi_dirs.extend([f"../../src/auto_archiver/modules/{m}" for m in os.listdir("../../src/auto_archiver/modules")])
|
||||||
autodoc_typehints = "signature" # Include type hints in the signature
|
autodoc_typehints = "signature" # Include type hints in the signature
|
||||||
autoapi_ignore = ["*/version.py", ] # Ignore specific modules
|
autoapi_ignore = [
|
||||||
autoapi_keep_files = True # Option to retain intermediate JSON files for debugging
|
"*/version.py",
|
||||||
autoapi_add_toctree_entry = True # Include API docs in the TOC
|
] # Ignore specific modules
|
||||||
|
autoapi_keep_files = True # Option to retain intermediate JSON files for debugging
|
||||||
|
autoapi_add_toctree_entry = True # Include API docs in the TOC
|
||||||
autoapi_python_use_implicit_namespaces = True
|
autoapi_python_use_implicit_namespaces = True
|
||||||
autoapi_template_dir = "../_templates/autoapi"
|
autoapi_template_dir = "../_templates/autoapi"
|
||||||
autoapi_options = [
|
autoapi_options = [
|
||||||
@@ -59,13 +61,13 @@ autoapi_options = [
|
|||||||
|
|
||||||
# -- Markdown Support --------------------------------------------------------
|
# -- Markdown Support --------------------------------------------------------
|
||||||
myst_enable_extensions = [
|
myst_enable_extensions = [
|
||||||
"deflist", # Definition lists
|
"deflist", # Definition lists
|
||||||
"html_admonition", # HTML-style admonitions
|
"html_admonition", # HTML-style admonitions
|
||||||
"html_image", # Inline HTML images
|
"html_image", # Inline HTML images
|
||||||
"replacements", # Substitutions like (C)
|
"replacements", # Substitutions like (C)
|
||||||
"smartquotes", # Smart quotes
|
"smartquotes", # Smart quotes
|
||||||
"linkify", # Auto-detect links
|
"linkify", # Auto-detect links
|
||||||
"substitution", # Text substitutions
|
"substitution", # Text substitutions
|
||||||
]
|
]
|
||||||
myst_heading_anchors = 2
|
myst_heading_anchors = 2
|
||||||
myst_fence_as_directive = ["mermaid"]
|
myst_fence_as_directive = ["mermaid"]
|
||||||
@@ -76,7 +78,7 @@ source_suffix = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# -- Options for HTML output -------------------------------------------------
|
# -- Options for HTML output -------------------------------------------------
|
||||||
html_theme = 'sphinx_book_theme'
|
html_theme = "sphinx_book_theme"
|
||||||
html_static_path = ["../_static"]
|
html_static_path = ["../_static"]
|
||||||
html_css_files = ["custom.css"]
|
html_css_files = ["custom.css"]
|
||||||
html_title = f"Auto Archiver v{__version__}"
|
html_title = f"Auto Archiver v{__version__}"
|
||||||
@@ -87,7 +89,6 @@ html_theme_options = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
copybutton_prompt_text = r">>> |\.\.\."
|
copybutton_prompt_text = r">>> |\.\.\."
|
||||||
copybutton_prompt_is_regexp = True
|
copybutton_prompt_is_regexp = True
|
||||||
copybutton_only_copy_prompt_lines = False
|
copybutton_only_copy_prompt_lines = False
|
||||||
@@ -32,4 +32,5 @@ testing
|
|||||||
docs
|
docs
|
||||||
release
|
release
|
||||||
settings_page
|
settings_page
|
||||||
|
style_guide
|
||||||
```
|
```
|
||||||
67
docs/source/development/style_guide.md
Normal file
67
docs/source/development/style_guide.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# Style Guide
|
||||||
|
|
||||||
|
|
||||||
|
The project uses [Ruff](https://docs.astral.sh/ruff/) for linting and formatting.
|
||||||
|
Our style configurations are set in the `pyproject.toml` file. If needed, you can modify them there.
|
||||||
|
|
||||||
|
|
||||||
|
### **Formatting (Auto-Run Before Commit) 🛠️**
|
||||||
|
|
||||||
|
We have a pre-commit hook to run the formatter before you commit.
|
||||||
|
This requires you to set it up once locally, then it will run automatically when you commit changes.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
poetry run pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
Ruff can also be set up to run automatically.
|
||||||
|
Alternative: Ruff can also be [integrated with most editors](https://docs.astral.sh/ruff/editors/setup/) for real-time formatting.
|
||||||
|
|
||||||
|
### **Linting (Check Before Pushing) 🔍**
|
||||||
|
|
||||||
|
We recommend you also run the linter before pushing code.
|
||||||
|
|
||||||
|
We have [Makefile](../../../Makefile) commands to run common tasks.
|
||||||
|
|
||||||
|
Tip: if you're on Windows you might need to install `make` first, or alternatively you can use ruff commands directly.
|
||||||
|
|
||||||
|
|
||||||
|
**Lint Check:** This outputs a report of any issues found, without attempting to fix them:
|
||||||
|
```shell
|
||||||
|
make ruff-check
|
||||||
|
```
|
||||||
|
|
||||||
|
Tip: To see a more detailed linting report, you can remove the following line from the `pyproject.toml` file:
|
||||||
|
```toml
|
||||||
|
[tool.ruff]
|
||||||
|
|
||||||
|
# Remove this for a more detailed lint report
|
||||||
|
output-format = "concise"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Lint Fix:** This command will attempt to fix some of the issues it picked up with the lint check.
|
||||||
|
|
||||||
|
Note that not all warnings can be fixed automatically.
|
||||||
|
|
||||||
|
⚠️ Warning: This can cause breaking changes. ⚠️
|
||||||
|
|
||||||
|
Most fixes are safe, but some non-standard practices such as dynamic loading are not picked up by linters. Make sure you review any modifications made by this command before committing them.
|
||||||
|
```shell
|
||||||
|
make ruff-clean
|
||||||
|
```
|
||||||
|
|
||||||
|
**Changing Configurations ⚙️**
|
||||||
|
|
||||||
|
|
||||||
|
Our rules are quite lenient for general usage, but if you want more rigorous checks you can enable additional rules to surface more nuanced errors, which you can then review manually.
|
||||||
|
Check out the [ruff documentation](https://docs.astral.sh/ruff/configuration/) for the full list of rules.
|
||||||
|
One example is to extend the selected linting rules in the `pyproject.toml` file:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[tool.ruff.lint]
|
||||||
|
# Extend the rules to check for by adding them to this option:
|
||||||
|
# See documentation for more details: https://docs.astral.sh/ruff/rules/
|
||||||
|
extend-select = ["B"]
|
||||||
|
```
|
||||||
|
|
||||||
|
Then re-run the `make ruff-check` command to see the new rules in action.
|
||||||
@@ -51,6 +51,7 @@ The invocations below will run the auto-archiver Docker image using a configurat
|
|||||||
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
|
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
|
||||||
|
|
||||||
# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
|
# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
|
||||||
|
# Note this expects you to have followed the [Google Sheets setup](how_to/google_sheets.md) and added your service_account.json to the `secrets/` folder
|
||||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||||
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||||
# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
|
# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
|
||||||
|
|||||||
157
poetry.lock
generated
157
poetry.lock
generated
@@ -481,6 +481,18 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
pycparser = "*"
|
pycparser = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfgv"
|
||||||
|
version = "3.4.0"
|
||||||
|
description = "Validate configuration and produce human readable error messages."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
|
||||||
|
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.4.1"
|
version = "3.4.1"
|
||||||
@@ -696,6 +708,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"]
|
|||||||
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
|
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
|
||||||
langdetect = ["langdetect (>=1.0.0)"]
|
langdetect = ["langdetect (>=1.0.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "distlib"
|
||||||
|
version = "0.3.9"
|
||||||
|
description = "Distribution utilities"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"},
|
||||||
|
{file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docutils"
|
name = "docutils"
|
||||||
version = "0.21.2"
|
version = "0.21.2"
|
||||||
@@ -742,6 +766,23 @@ future = "*"
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
|
dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "filelock"
|
||||||
|
version = "3.17.0"
|
||||||
|
description = "A platform independent file lock."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.9"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"},
|
||||||
|
{file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
|
||||||
|
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
|
||||||
|
typing = ["typing-extensions (>=4.12.2)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "future"
|
name = "future"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
@@ -919,6 +960,21 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}
|
pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "identify"
|
||||||
|
version = "2.6.9"
|
||||||
|
description = "File identification library for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.9"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "identify-2.6.9-py2.py3-none-any.whl", hash = "sha256:c98b4322da415a8e5a70ff6e51fbc2d2932c015532d77e9f8537b4ba7813b150"},
|
||||||
|
{file = "identify-2.6.9.tar.gz", hash = "sha256:d40dfe3142a1421d8518e3d3985ef5ac42890683e32306ad614a29490abeb6bf"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
license = ["ukkonen"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "3.10"
|
version = "3.10"
|
||||||
@@ -1260,6 +1316,18 @@ rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-bo
|
|||||||
testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
|
testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
|
||||||
testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"]
|
testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nodeenv"
|
||||||
|
version = "1.9.1"
|
||||||
|
description = "Node.js virtual environment builder"
|
||||||
|
optional = false
|
||||||
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
|
||||||
|
{file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "numpy"
|
name = "numpy"
|
||||||
version = "2.1.3"
|
version = "2.1.3"
|
||||||
@@ -1513,6 +1581,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
|
|||||||
typing = ["typing-extensions"]
|
typing = ["typing-extensions"]
|
||||||
xmp = ["defusedxml"]
|
xmp = ["defusedxml"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "platformdirs"
|
||||||
|
version = "4.3.6"
|
||||||
|
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
|
||||||
|
{file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
|
||||||
|
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
|
||||||
|
type = ["mypy (>=1.11.2)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pluggy"
|
name = "pluggy"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
@@ -1529,6 +1614,25 @@ files = [
|
|||||||
dev = ["pre-commit", "tox"]
|
dev = ["pre-commit", "tox"]
|
||||||
testing = ["pytest", "pytest-benchmark"]
|
testing = ["pytest", "pytest-benchmark"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pre-commit"
|
||||||
|
version = "4.1.0"
|
||||||
|
description = "A framework for managing and maintaining multi-language pre-commit hooks."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.9"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"},
|
||||||
|
{file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
cfgv = ">=2.0.0"
|
||||||
|
identify = ">=1.0.0"
|
||||||
|
nodeenv = ">=0.11.1"
|
||||||
|
pyyaml = ">=5.1"
|
||||||
|
virtualenv = ">=20.10.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proto-plus"
|
name = "proto-plus"
|
||||||
version = "1.26.0"
|
version = "1.26.0"
|
||||||
@@ -1902,7 +2006,7 @@ version = "6.0.2"
|
|||||||
description = "YAML parser and emitter for Python"
|
description = "YAML parser and emitter for Python"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
groups = ["docs"]
|
groups = ["dev", "docs"]
|
||||||
files = [
|
files = [
|
||||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||||
@@ -2246,6 +2350,34 @@ files = [
|
|||||||
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ruff"
|
||||||
|
version = "0.9.10"
|
||||||
|
description = "An extremely fast Python linter and code formatter, written in Rust."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "ruff-0.9.10-py3-none-linux_armv6l.whl", hash = "sha256:eb4d25532cfd9fe461acc83498361ec2e2252795b4f40b17e80692814329e42d"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:188a6638dab1aa9bb6228a7302387b2c9954e455fb25d6b4470cb0641d16759d"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5284dcac6b9dbc2fcb71fdfc26a217b2ca4ede6ccd57476f52a587451ebe450d"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47678f39fa2a3da62724851107f438c8229a3470f533894b5568a39b40029c0c"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:99713a6e2766b7a17147b309e8c915b32b07a25c9efd12ada79f217c9c778b3e"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:524ee184d92f7c7304aa568e2db20f50c32d1d0caa235d8ddf10497566ea1a12"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:df92aeac30af821f9acf819fc01b4afc3dfb829d2782884f8739fb52a8119a16"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de42e4edc296f520bb84954eb992a07a0ec5a02fecb834498415908469854a52"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d257f95b65806104b6b1ffca0ea53f4ef98454036df65b1eda3693534813ecd1"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b60dec7201c0b10d6d11be00e8f2dbb6f40ef1828ee75ed739923799513db24c"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:d838b60007da7a39c046fcdd317293d10b845001f38bcb55ba766c3875b01e43"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ccaf903108b899beb8e09a63ffae5869057ab649c1e9231c05ae354ebc62066c"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9567d135265d46e59d62dc60c0bfad10e9a6822e231f5b24032dba5a55be6b5"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5f202f0d93738c28a89f8ed9eaba01b7be339e5d8d642c994347eaa81c6d75b8"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-win32.whl", hash = "sha256:bfb834e87c916521ce46b1788fbb8484966e5113c02df216680102e9eb960029"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-win_amd64.whl", hash = "sha256:f2160eeef3031bf4b17df74e307d4c5fb689a6f3a26a2de3f7ef4044e3c484f1"},
|
||||||
|
{file = "ruff-0.9.10-py3-none-win_arm64.whl", hash = "sha256:5fd804c0327a5e5ea26615550e706942f348b197d5475ff34c19733aee4b2e69"},
|
||||||
|
{file = "ruff-0.9.10.tar.gz", hash = "sha256:9bacb735d7bada9cfb0f2c227d3658fc443d90a727b47f206fb33f52f3c0eac7"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "s3transfer"
|
name = "s3transfer"
|
||||||
version = "0.11.4"
|
version = "0.11.4"
|
||||||
@@ -2883,6 +3015,27 @@ typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
|
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "virtualenv"
|
||||||
|
version = "20.29.3"
|
||||||
|
description = "Virtual Python Environment builder"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
groups = ["dev"]
|
||||||
|
files = [
|
||||||
|
{file = "virtualenv-20.29.3-py3-none-any.whl", hash = "sha256:3e3d00f5807e83b234dfb6122bf37cfadf4be216c53a49ac059d02414f819170"},
|
||||||
|
{file = "virtualenv-20.29.3.tar.gz", hash = "sha256:95e39403fcf3940ac45bc717597dba16110b74506131845d9b687d5e73d947ac"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
distlib = ">=0.3.7,<1"
|
||||||
|
filelock = ">=3.12.2,<4"
|
||||||
|
platformdirs = ">=3.9.1,<5"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
|
||||||
|
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "vk-api"
|
name = "vk-api"
|
||||||
version = "11.9.9"
|
version = "11.9.9"
|
||||||
@@ -3185,4 +3338,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.1"
|
lock-version = "2.1"
|
||||||
python-versions = ">=3.10,<3.13"
|
python-versions = ">=3.10,<3.13"
|
||||||
content-hash = "2d0a953383901fe12e97f6f56a76a9d8008788695425792eedbf739a18585188"
|
content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901"
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ pytest = "^8.3.4"
|
|||||||
autopep8 = "^2.3.1"
|
autopep8 = "^2.3.1"
|
||||||
pytest-loguru = "^0.4.0"
|
pytest-loguru = "^0.4.0"
|
||||||
pytest-mock = "^3.14.0"
|
pytest-mock = "^3.14.0"
|
||||||
|
ruff = "^0.9.10"
|
||||||
|
pre-commit = "^4.1.0"
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
sphinx = "^8.1.3"
|
sphinx = "^8.1.3"
|
||||||
@@ -90,3 +92,28 @@ markers = [
|
|||||||
"download: marks tests that download content from the network",
|
"download: marks tests that download content from the network",
|
||||||
"incremental: marks a class to run tests incrementally. If a test fails in the class, the remaining tests will be skipped",
|
"incremental: marks a class to run tests incrementally. If a test fails in the class, the remaining tests will be skipped",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
#exclude = ["docs"]
|
||||||
|
line-length = 120
|
||||||
|
# Remove this for a more detailed lint report
|
||||||
|
output-format = "concise"
|
||||||
|
# TODO: temp ignore rule for timestamping_enricher to allow for open PR
|
||||||
|
exclude = ["src/auto_archiver/modules/timestamping_enricher/*"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
# Extend the rules to check for by adding them to this option:
|
||||||
|
# See documentation for more details: https://docs.astral.sh/ruff/rules/
|
||||||
|
#extend-select = ["B"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
# Ignore import violations in __init__.py files
|
||||||
|
"__init__.py" = ["F401", "F403"]
|
||||||
|
# Ignore 'useless expression' in manifest files.
|
||||||
|
"__manifest__.py" = ["B018"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
docstring-code-format = false
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os.path
|
import os.path
|
||||||
import click, json
|
import click
|
||||||
|
import json
|
||||||
|
|
||||||
from google.auth.transport.requests import Request
|
from google.auth.transport.requests import Request
|
||||||
from google.oauth2.credentials import Credentials
|
from google.oauth2.credentials import Credentials
|
||||||
@@ -70,11 +71,7 @@ def main(credentials, token):
|
|||||||
print(emailAddress)
|
print(emailAddress)
|
||||||
|
|
||||||
# Call the Drive v3 API and return some files
|
# Call the Drive v3 API and return some files
|
||||||
results = (
|
results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
|
||||||
service.files()
|
|
||||||
.list(pageSize=10, fields="nextPageToken, files(id, name)")
|
|
||||||
.execute()
|
|
||||||
)
|
|
||||||
items = results.get("files", [])
|
items = results.get("files", [])
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
|
|||||||
@@ -8,12 +8,14 @@ from auto_archiver.core.module import ModuleFactory
|
|||||||
from auto_archiver.core.consts import MODULE_TYPES
|
from auto_archiver.core.consts import MODULE_TYPES
|
||||||
from auto_archiver.core.config import EMPTY_CONFIG
|
from auto_archiver.core.config import EMPTY_CONFIG
|
||||||
|
|
||||||
|
|
||||||
class SchemaEncoder(json.JSONEncoder):
|
class SchemaEncoder(json.JSONEncoder):
|
||||||
def default(self, obj):
|
def default(self, obj):
|
||||||
if isinstance(obj, set):
|
if isinstance(obj, set):
|
||||||
return list(obj)
|
return list(obj)
|
||||||
return json.JSONEncoder.default(self, obj)
|
return json.JSONEncoder.default(self, obj)
|
||||||
|
|
||||||
|
|
||||||
# Get available modules
|
# Get available modules
|
||||||
module_factory = ModuleFactory()
|
module_factory = ModuleFactory()
|
||||||
available_modules = module_factory.available_modules()
|
available_modules = module_factory.available_modules()
|
||||||
@@ -21,32 +23,40 @@ available_modules = module_factory.available_modules()
|
|||||||
modules_by_type = {}
|
modules_by_type = {}
|
||||||
# Categorize modules by type
|
# Categorize modules by type
|
||||||
for module in available_modules:
|
for module in available_modules:
|
||||||
for type in module.manifest.get('type', []):
|
for type in module.manifest.get("type", []):
|
||||||
modules_by_type.setdefault(type, []).append(module)
|
modules_by_type.setdefault(type, []).append(module)
|
||||||
|
|
||||||
all_modules_ordered_by_type = sorted(available_modules, key=lambda x: (MODULE_TYPES.index(x.type[0]), not x.requires_setup))
|
all_modules_ordered_by_type = sorted(
|
||||||
|
available_modules, key=lambda x: (MODULE_TYPES.index(x.type[0]), not x.requires_setup)
|
||||||
|
)
|
||||||
|
|
||||||
yaml: YAML = YAML()
|
yaml: YAML = YAML()
|
||||||
|
|
||||||
config_string = io.BytesIO()
|
config_string = io.BytesIO()
|
||||||
yaml.dump(EMPTY_CONFIG, config_string)
|
yaml.dump(EMPTY_CONFIG, config_string)
|
||||||
config_string = config_string.getvalue().decode('utf-8')
|
config_string = config_string.getvalue().decode("utf-8")
|
||||||
output_schema = {
|
output_schema = {
|
||||||
'modules': dict((module.name,
|
"modules": dict(
|
||||||
{
|
(
|
||||||
'name': module.name,
|
module.name,
|
||||||
'display_name': module.display_name,
|
{
|
||||||
'manifest': module.manifest,
|
"name": module.name,
|
||||||
'configs': module.configs or None
|
"display_name": module.display_name,
|
||||||
}
|
"manifest": module.manifest,
|
||||||
) for module in all_modules_ordered_by_type),
|
"configs": module.configs or None,
|
||||||
'steps': dict((f"{module_type}s", [module.name for module in modules_by_type[module_type]]) for module_type in MODULE_TYPES),
|
},
|
||||||
'configs': [m.name for m in all_modules_ordered_by_type if m.configs],
|
)
|
||||||
'module_types': MODULE_TYPES,
|
for module in all_modules_ordered_by_type
|
||||||
'empty_config': config_string
|
),
|
||||||
|
"steps": dict(
|
||||||
|
(f"{module_type}s", [module.name for module in modules_by_type[module_type]]) for module_type in MODULE_TYPES
|
||||||
|
),
|
||||||
|
"configs": [m.name for m in all_modules_ordered_by_type if m.configs],
|
||||||
|
"module_types": MODULE_TYPES,
|
||||||
|
"empty_config": config_string,
|
||||||
}
|
}
|
||||||
|
|
||||||
current_file_dir = os.path.dirname(os.path.abspath(__file__))
|
current_file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
output_file = os.path.join(current_file_dir, 'settings/src/schema.json')
|
output_file = os.path.join(current_file_dir, "settings/src/schema.json")
|
||||||
with open(output_file, 'w') as file:
|
with open(output_file, "w") as file:
|
||||||
json.dump(output_schema, file, indent=4, cls=SchemaEncoder)
|
json.dump(output_schema, file, indent=4, cls=SchemaEncoder)
|
||||||
@@ -12,7 +12,6 @@ Then run this script to create a new session file.
|
|||||||
You will need to provide your phone number and a 2FA code the first time you run this script.
|
You will need to provide your phone number and a 2FA code the first time you run this script.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from telethon.sync import TelegramClient
|
from telethon.sync import TelegramClient
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -26,4 +25,3 @@ SESSION_FILE = "secrets/anon-insta"
|
|||||||
os.makedirs("secrets", exist_ok=True)
|
os.makedirs("secrets", exist_ok=True)
|
||||||
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
||||||
logger.success(f"New session file created: {SESSION_FILE}.session")
|
logger.success(f"New session file created: {SESSION_FILE}.session")
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,13 @@
|
|||||||
""" Entry point for the auto_archiver package. """
|
"""Entry point for the auto_archiver package."""
|
||||||
|
|
||||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
|
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
""" Core modules to handle things such as orchestration, metadata and configs..
|
"""Core modules to handle things such as orchestration, metadata and configs.."""
|
||||||
|
|
||||||
"""
|
|
||||||
from .metadata import Metadata
|
from .metadata import Metadata
|
||||||
from .media import Media
|
from .media import Media
|
||||||
from .base_module import BaseModule
|
from .base_module import BaseModule
|
||||||
|
|||||||
@@ -1,9 +1,8 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
from typing import Mapping, Any, TYPE_CHECKING
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
from copy import deepcopy, copy
|
from copy import deepcopy
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from auto_archiver.utils import url as UrlUtil
|
from auto_archiver.utils import url as UrlUtil
|
||||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||||
@@ -13,8 +12,8 @@ from loguru import logger
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .module import ModuleFactory
|
from .module import ModuleFactory
|
||||||
|
|
||||||
class BaseModule(ABC):
|
|
||||||
|
|
||||||
|
class BaseModule(ABC):
|
||||||
"""
|
"""
|
||||||
Base module class. All modules should inherit from this class.
|
Base module class. All modules should inherit from this class.
|
||||||
|
|
||||||
@@ -46,14 +45,13 @@ class BaseModule(ABC):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def storages(self) -> list:
|
def storages(self) -> list:
|
||||||
return self.config.get('storages', [])
|
return self.config.get("storages", [])
|
||||||
|
|
||||||
def config_setup(self, config: dict):
|
def config_setup(self, config: dict):
|
||||||
|
|
||||||
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
||||||
# change values to affect other modules
|
# change values to affect other modules
|
||||||
config = deepcopy(config)
|
config = deepcopy(config)
|
||||||
authentication = deepcopy(config.pop('authentication', {}))
|
authentication = deepcopy(config.pop("authentication", {}))
|
||||||
|
|
||||||
self.authentication = authentication
|
self.authentication = authentication
|
||||||
self.config = config
|
self.config = config
|
||||||
@@ -61,7 +59,8 @@ class BaseModule(ABC):
|
|||||||
setattr(self, key, val)
|
setattr(self, key, val)
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
# For any additional setup required by modules, e.g. autehntication
|
# For any additional setup required by modules outside of the configs in the manifest,
|
||||||
|
# e.g. authentication
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
|
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
|
||||||
@@ -90,11 +89,10 @@ class BaseModule(ABC):
|
|||||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||||
|
|
||||||
site = UrlUtil.domain_for_url(site).lstrip("www.")
|
site = UrlUtil.domain_for_url(site).removeprefix("www.")
|
||||||
# add the 'www' version of the site to the list of sites to check
|
# add the 'www' version of the site to the list of sites to check
|
||||||
authdict = {}
|
authdict = {}
|
||||||
|
|
||||||
|
|
||||||
for to_try in [site, f"www.{site}"]:
|
for to_try in [site, f"www.{site}"]:
|
||||||
if to_try in self.authentication:
|
if to_try in self.authentication:
|
||||||
authdict.update(self.authentication[to_try])
|
authdict.update(self.authentication[to_try])
|
||||||
@@ -104,17 +102,20 @@ class BaseModule(ABC):
|
|||||||
if not authdict:
|
if not authdict:
|
||||||
for key in self.authentication.keys():
|
for key in self.authentication.keys():
|
||||||
if key in site or site in key:
|
if key in site or site in key:
|
||||||
logger.debug(f"Could not find exact authentication information for site '{site}'. \
|
logger.debug(
|
||||||
|
f"Could not find exact authentication information for site '{site}'. \
|
||||||
did find information for '{key}' which is close, is this what you meant? \
|
did find information for '{key}' which is close, is this what you meant? \
|
||||||
If so, edit your authentication settings to make sure it exactly matches.")
|
If so, edit your authentication settings to make sure it exactly matches."
|
||||||
|
)
|
||||||
|
|
||||||
def get_ytdlp_cookiejar(args):
|
def get_ytdlp_cookiejar(args):
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
from yt_dlp import parse_options
|
from yt_dlp import parse_options
|
||||||
|
|
||||||
logger.debug(f"Extracting cookies from settings: {args[1]}")
|
logger.debug(f"Extracting cookies from settings: {args[1]}")
|
||||||
# parse_options returns a named tuple as follows, we only need the ydl_options part
|
# parse_options returns a named tuple as follows, we only need the ydl_options part
|
||||||
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
|
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
|
||||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
ytdlp_opts = getattr(parse_options(args), "ydl_opts")
|
||||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||||
|
|
||||||
get_cookiejar_options = None
|
get_cookiejar_options = None
|
||||||
@@ -125,20 +126,19 @@ If so, edit your authentication settings to make sure it exactly matches.")
|
|||||||
# 3. cookies_from_browser setting in global config
|
# 3. cookies_from_browser setting in global config
|
||||||
# 4. cookies_file setting in global config
|
# 4. cookies_file setting in global config
|
||||||
|
|
||||||
if 'cookies_from_browser' in authdict:
|
if "cookies_from_browser" in authdict:
|
||||||
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
|
get_cookiejar_options = ["--cookies-from-browser", authdict["cookies_from_browser"]]
|
||||||
elif 'cookies_file' in authdict:
|
elif "cookies_file" in authdict:
|
||||||
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
|
get_cookiejar_options = ["--cookies", authdict["cookies_file"]]
|
||||||
elif 'cookies_from_browser' in self.authentication:
|
elif "cookies_from_browser" in self.authentication:
|
||||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
authdict["cookies_from_browser"] = self.authentication["cookies_from_browser"]
|
||||||
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
|
get_cookiejar_options = ["--cookies-from-browser", self.authentication["cookies_from_browser"]]
|
||||||
elif 'cookies_file' in self.authentication:
|
elif "cookies_file" in self.authentication:
|
||||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
authdict["cookies_file"] = self.authentication["cookies_file"]
|
||||||
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
|
get_cookiejar_options = ["--cookies", self.authentication["cookies_file"]]
|
||||||
|
|
||||||
|
|
||||||
if get_cookiejar_options:
|
if get_cookiejar_options:
|
||||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
|
authdict["cookies_jar"] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||||
|
|
||||||
return authdict
|
return authdict
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ flexible setup in various environments.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
from ruamel.yaml import YAML, CommentedMap, add_representer
|
from ruamel.yaml import YAML, CommentedMap
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -19,12 +19,14 @@ _yaml: YAML = YAML()
|
|||||||
|
|
||||||
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
||||||
|
|
||||||
EMPTY_CONFIG = _yaml.load("""
|
EMPTY_CONFIG = _yaml.load(
|
||||||
|
"""
|
||||||
# Auto Archiver Configuration
|
# Auto Archiver Configuration
|
||||||
|
|
||||||
# Steps are the modules that will be run in the order they are defined
|
# Steps are the modules that will be run in the order they are defined
|
||||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
steps:"""
|
||||||
"""
|
+ "".join([f"\n {module}s: []" for module in MODULE_TYPES])
|
||||||
|
+ """
|
||||||
|
|
||||||
# Global configuration
|
# Global configuration
|
||||||
|
|
||||||
@@ -51,50 +53,54 @@ authentication: {}
|
|||||||
logging:
|
logging:
|
||||||
level: INFO
|
level: INFO
|
||||||
|
|
||||||
""")
|
"""
|
||||||
|
)
|
||||||
# note: 'logging' is explicitly added above in order to better format the config file
|
# note: 'logging' is explicitly added above in order to better format the config file
|
||||||
|
|
||||||
|
|
||||||
# Arg Parse Actions/Classes
|
# Arg Parse Actions/Classes
|
||||||
class AuthenticationJsonParseAction(argparse.Action):
|
class AuthenticationJsonParseAction(argparse.Action):
|
||||||
def __call__(self, parser, namespace, values, option_string=None):
|
def __call__(self, parser, namespace, values, option_string=None):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
auth_dict = json.loads(values)
|
auth_dict = json.loads(values)
|
||||||
setattr(namespace, self.dest, auth_dict)
|
setattr(namespace, self.dest, auth_dict)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") from e
|
||||||
|
|
||||||
def load_from_file(path):
|
def load_from_file(path):
|
||||||
try:
|
try:
|
||||||
with open(path, 'r') as f:
|
with open(path, "r") as f:
|
||||||
try:
|
try:
|
||||||
auth_dict = json.load(f)
|
auth_dict = json.load(f)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
# maybe it's yaml, try that
|
# maybe it's yaml, try that
|
||||||
auth_dict = _yaml.load(f)
|
auth_dict = _yaml.load(f)
|
||||||
if auth_dict.get('authentication'):
|
if auth_dict.get("authentication"):
|
||||||
auth_dict = auth_dict['authentication']
|
auth_dict = auth_dict["authentication"]
|
||||||
auth_dict['load_from_file'] = path
|
auth_dict["load_from_file"] = path
|
||||||
return auth_dict
|
return auth_dict
|
||||||
except:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
if isinstance(auth_dict, dict) and auth_dict.get("from_file"):
|
||||||
auth_dict = load_from_file(auth_dict['from_file'])
|
auth_dict = load_from_file(auth_dict["from_file"])
|
||||||
elif isinstance(auth_dict, str):
|
elif isinstance(auth_dict, str):
|
||||||
# if it's a string
|
# if it's a string
|
||||||
auth_dict = load_from_file(auth_dict)
|
auth_dict = load_from_file(auth_dict)
|
||||||
|
|
||||||
if not isinstance(auth_dict, dict):
|
if not isinstance(auth_dict, dict):
|
||||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
raise argparse.ArgumentTypeError(
|
||||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
"Authentication must be a dictionary of site names and their authentication methods"
|
||||||
|
)
|
||||||
|
global_options = ["cookies_from_browser", "cookies_file", "load_from_file"]
|
||||||
for key, auth in auth_dict.items():
|
for key, auth in auth_dict.items():
|
||||||
if key in global_options:
|
if key in global_options:
|
||||||
continue
|
continue
|
||||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
raise argparse.ArgumentTypeError(
|
||||||
|
f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}"
|
||||||
|
)
|
||||||
|
|
||||||
setattr(namespace, self.dest, auth_dict)
|
setattr(namespace, self.dest, auth_dict)
|
||||||
|
|
||||||
@@ -105,8 +111,8 @@ class UniqueAppendAction(argparse.Action):
|
|||||||
if value not in getattr(namespace, self.dest):
|
if value not in getattr(namespace, self.dest):
|
||||||
getattr(namespace, self.dest).append(value)
|
getattr(namespace, self.dest).append(value)
|
||||||
|
|
||||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
|
||||||
|
|
||||||
|
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||||
def error(self, message):
|
def error(self, message):
|
||||||
"""
|
"""
|
||||||
Override of error to format a nicer looking error message using logger
|
Override of error to format a nicer looking error message using logger
|
||||||
@@ -135,8 +141,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
|||||||
|
|
||||||
return super().parse_known_args(args, namespace)
|
return super().parse_known_args(args, namespace)
|
||||||
|
|
||||||
|
|
||||||
# Config Utils
|
# Config Utils
|
||||||
|
|
||||||
|
|
||||||
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||||
dotdict = {}
|
dotdict = {}
|
||||||
|
|
||||||
@@ -150,6 +158,7 @@ def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
|||||||
process_subdict(yaml_conf)
|
process_subdict(yaml_conf)
|
||||||
return dotdict
|
return dotdict
|
||||||
|
|
||||||
|
|
||||||
def from_dot_notation(dotdict: dict) -> dict:
|
def from_dot_notation(dotdict: dict) -> dict:
|
||||||
normal_dict = {}
|
normal_dict = {}
|
||||||
|
|
||||||
@@ -170,9 +179,11 @@ def from_dot_notation(dotdict: dict) -> dict:
|
|||||||
def is_list_type(value):
|
def is_list_type(value):
|
||||||
return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set)
|
return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set)
|
||||||
|
|
||||||
|
|
||||||
def is_dict_type(value):
|
def is_dict_type(value):
|
||||||
return isinstance(value, dict) or isinstance(value, CommentedMap)
|
return isinstance(value, dict) or isinstance(value, CommentedMap)
|
||||||
|
|
||||||
|
|
||||||
def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||||
yaml_dict: CommentedMap = deepcopy(yaml_dict)
|
yaml_dict: CommentedMap = deepcopy(yaml_dict)
|
||||||
|
|
||||||
@@ -183,7 +194,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
|||||||
yaml_subdict[key] = value
|
yaml_subdict[key] = value
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if key == 'steps':
|
if key == "steps":
|
||||||
for module_type, modules in value.items():
|
for module_type, modules in value.items():
|
||||||
# overwrite the 'steps' from the config file with the ones from the CLI
|
# overwrite the 'steps' from the config file with the ones from the CLI
|
||||||
yaml_subdict[key][module_type] = modules
|
yaml_subdict[key][module_type] = modules
|
||||||
@@ -198,6 +209,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
|||||||
update_dict(from_dot_notation(dotdict), yaml_dict)
|
update_dict(from_dot_notation(dotdict), yaml_dict)
|
||||||
return yaml_dict
|
return yaml_dict
|
||||||
|
|
||||||
|
|
||||||
def read_yaml(yaml_filename: str) -> CommentedMap:
|
def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||||
config = None
|
config = None
|
||||||
try:
|
try:
|
||||||
@@ -211,6 +223,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
|||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||||
|
|
||||||
|
|
||||||
@@ -218,13 +231,14 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
|||||||
config_to_save = deepcopy(config)
|
config_to_save = deepcopy(config)
|
||||||
|
|
||||||
auth_dict = config_to_save.get("authentication", {})
|
auth_dict = config_to_save.get("authentication", {})
|
||||||
if auth_dict and auth_dict.get('load_from_file'):
|
if auth_dict and auth_dict.get("load_from_file"):
|
||||||
# remove all other values from the config, don't want to store it in the config file
|
# remove all other values from the config, don't want to store it in the config file
|
||||||
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
|
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
|
||||||
|
|
||||||
config_to_save.pop('urls', None)
|
config_to_save.pop("urls", None)
|
||||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||||
_yaml.dump(config_to_save, outf)
|
_yaml.dump(config_to_save, outf)
|
||||||
|
|
||||||
|
|
||||||
def is_valid_config(config: CommentedMap) -> bool:
    """Report whether ``config`` holds settings worth using.

    A config counts as valid when it is truthy (so not ``None`` and not an
    empty mapping) and is not equal to ``EMPTY_CONFIG``.

    Returns:
        A real ``bool``. The truthiness check is wrapped in ``bool()`` so that
        a falsy ``config`` produces ``False`` instead of being handed back
        unchanged (``None`` or ``{}``), which would contradict the annotation.
    """
    return bool(config) and config != EMPTY_CONFIG
|
||||||
@@ -1,25 +1,19 @@
|
|||||||
class SetupError(ValueError):
    """A ``ValueError`` subclass used to signal setup problems."""
|
||||||
|
|
||||||
# The kinds of module a manifest may declare in its "type" entry.
MODULE_TYPES = [
    "feeder",
    "extractor",
    "enricher",
    "database",
    "storage",
    "formatter",
]

# File name of a module's manifest.
MANIFEST_FILE = "__manifest__.py"

# Baseline manifest values.
DEFAULT_MANIFEST = {
    # the display name of the module
    "name": "",
    # creator of the module, leave this as Bellingcat or set your own name!
    "author": "Bellingcat",
    # the type of the module, can be one or more of MODULE_TYPES
    "type": [],
    # whether or not this module requires additional setup such as setting
    # API Keys or installing additional software
    "requires_setup": True,
    # a description of the module
    "description": "",
    # external dependencies, e.g. python packages or binaries, in dictionary format
    "dependencies": {},
    # the entry point for the module, in the format 'module_name::ClassName'.
    # This can be left blank to use the default entry point of
    # module_name::ModuleName
    "entry_point": "",
    # the version of the module
    "version": "1.0",
    # any configuration options this module has, these will be exposed to the
    # user in the config file or via the command line
    "configs": {},
}
|
||||||
@@ -9,6 +9,7 @@ from typing import Union
|
|||||||
|
|
||||||
from auto_archiver.core import Metadata, BaseModule
|
from auto_archiver.core import Metadata, BaseModule
|
||||||
|
|
||||||
|
|
||||||
class Database(BaseModule):
|
class Database(BaseModule):
|
||||||
"""
|
"""
|
||||||
Base class for implementing database modules in the media archiving framework.
|
Base class for implementing database modules in the media archiving framework.
|
||||||
@@ -20,7 +21,7 @@ class Database(BaseModule):
|
|||||||
"""signals the DB that the given item archival has started"""
|
"""signals the DB that the given item archival has started"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def failed(self, item: Metadata, reason: str) -> None:
    """Update the DB to reflect that archiving ``item`` failed.

    ``reason`` describes the failure. This base implementation does nothing.
    """
|
||||||
|
|
||||||
@@ -34,6 +35,6 @@ class Database(BaseModule):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
@abstractmethod
def done(self, item: Metadata, cached: bool = False) -> None:
    """Archival result ready - should be saved to DB.

    Declared abstract; this default body does nothing.
    """
|
||||||
|
|||||||
@@ -8,10 +8,12 @@ the archiving step and before storage or formatting.
|
|||||||
|
|
||||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from auto_archiver.core import Metadata, BaseModule
|
from auto_archiver.core import Metadata, BaseModule
|
||||||
|
|
||||||
|
|
||||||
class Enricher(BaseModule):
|
class Enricher(BaseModule):
|
||||||
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,15 @@
|
|||||||
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
"""The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||||
This class provides common utility methods and a standard interface for extractors.
|
This class provides common utility methods and a standard interface for extractors.
|
||||||
|
|
||||||
Factory method to initialize an extractor instance based on its name.
|
Factory method to initialize an extractor instance based on its name.
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from pathlib import Path
|
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from dataclasses import dataclass
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import mimetypes
|
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
@@ -74,17 +72,17 @@ class Extractor(BaseModule):
|
|||||||
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
|
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
|
||||||
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
|
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
|
||||||
"""
|
"""
|
||||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||||
"""
|
"""
|
||||||
if not to_filename:
|
if not to_filename:
|
||||||
to_filename = url.split('/')[-1].split('?')[0]
|
to_filename = url.split("/")[-1].split("?")[0]
|
||||||
if len(to_filename) > 64:
|
if len(to_filename) > 64:
|
||||||
to_filename = to_filename[-64:]
|
to_filename = to_filename[-64:]
|
||||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||||
if verbose:
|
if verbose:
|
||||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
d = requests.get(url, stream=True, headers=headers, timeout=30)
|
d = requests.get(url, stream=True, headers=headers, timeout=30)
|
||||||
@@ -92,12 +90,12 @@ class Extractor(BaseModule):
|
|||||||
|
|
||||||
# get mimetype from the response headers
|
# get mimetype from the response headers
|
||||||
if not mimetypes.guess_type(to_filename)[0]:
|
if not mimetypes.guess_type(to_filename)[0]:
|
||||||
content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
|
content_type = d.headers.get("Content-Type") or self._guess_file_type(url)
|
||||||
extension = mimetypes.guess_extension(content_type)
|
extension = mimetypes.guess_extension(content_type)
|
||||||
if extension:
|
if extension:
|
||||||
to_filename += extension
|
to_filename += extension
|
||||||
|
|
||||||
with open(to_filename, 'wb') as f:
|
with open(to_filename, "wb") as f:
|
||||||
for chunk in d.iter_content(chunk_size=8192):
|
for chunk in d.iter_content(chunk_size=8192):
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
return to_filename
|
return to_filename
|
||||||
|
|||||||
@@ -7,8 +7,8 @@ from abc import abstractmethod
|
|||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
from auto_archiver.core import BaseModule
|
from auto_archiver.core import BaseModule
|
||||||
|
|
||||||
class Feeder(BaseModule):
|
|
||||||
|
|
||||||
|
class Feeder(BaseModule):
|
||||||
"""
|
"""
|
||||||
Base class for implementing feeders in the media archiving framework.
|
Base class for implementing feeders in the media archiving framework.
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class Media:
|
|||||||
- properties: Additional metadata or transformations for the media.
|
- properties: Additional metadata or transformations for the media.
|
||||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
filename: str
|
filename: str
|
||||||
_key: str = None
|
_key: str = None
|
||||||
urls: List[str] = field(default_factory=list)
|
urls: List[str] = field(default_factory=list)
|
||||||
@@ -51,7 +52,8 @@ class Media:
|
|||||||
This function returns a generator for all the inner media.
|
This function returns a generator for all the inner media.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if include_self: yield self
|
if include_self:
|
||||||
|
yield self
|
||||||
for prop in self.properties.values():
|
for prop in self.properties.values():
|
||||||
if isinstance(prop, Media):
|
if isinstance(prop, Media):
|
||||||
for inner_media in prop.all_inner_media(include_self=True):
|
for inner_media in prop.all_inner_media(include_self=True):
|
||||||
@@ -113,15 +115,17 @@ class Media:
|
|||||||
# checks for video streams with ffmpeg, or min file size for a video
|
# checks for video streams with ffmpeg, or min file size for a video
|
||||||
# self.is_video() should be used together with this method
|
# self.is_video() should be used together with this method
|
||||||
try:
|
try:
|
||||||
streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
|
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
|
||||||
logger.warning(f"STREAMS FOR {self.filename} {streams}")
|
logger.warning(f"STREAMS FOR {self.filename} {streams}")
|
||||||
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
||||||
except Error: return False # ffmpeg errors when reading bad files
|
except Error:
|
||||||
|
return False # ffmpeg errors when reading bad files
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
logger.error(traceback.format_exc())
|
logger.error(traceback.format_exc())
|
||||||
try:
|
try:
|
||||||
fsize = os.path.getsize(self.filename)
|
fsize = os.path.getsize(self.filename)
|
||||||
return fsize > 20_000
|
return fsize > 20_000
|
||||||
except: pass
|
except Exception as e:
|
||||||
|
pass
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from __future__ import annotations
|
|||||||
import hashlib
|
import hashlib
|
||||||
from typing import Any, List, Union, Dict
|
from typing import Any, List, Union, Dict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json
|
||||||
import datetime
|
import datetime
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
@@ -21,6 +21,7 @@ from loguru import logger
|
|||||||
|
|
||||||
from .media import Media
|
from .media import Media
|
||||||
|
|
||||||
|
|
||||||
@dataclass_json # annotation order matters
|
@dataclass_json # annotation order matters
|
||||||
@dataclass
|
@dataclass
|
||||||
class Metadata:
|
class Metadata:
|
||||||
@@ -40,19 +41,23 @@ class Metadata:
|
|||||||
- If `True`, this instance's values are overwritten by `right`.
|
- If `True`, this instance's values are overwritten by `right`.
|
||||||
- If `False`, the inverse applies.
|
- If `False`, the inverse applies.
|
||||||
"""
|
"""
|
||||||
if not right: return self
|
if not right:
|
||||||
|
return self
|
||||||
if overwrite_left:
|
if overwrite_left:
|
||||||
if right.status and len(right.status):
|
if right.status and len(right.status):
|
||||||
self.status = right.status
|
self.status = right.status
|
||||||
self._context.update(right._context)
|
self._context.update(right._context)
|
||||||
for k, v in right.metadata.items():
|
for k, v in right.metadata.items():
|
||||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
assert k not in self.metadata or type(v) is type(self.get(k))
|
||||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
if not isinstance(v, (dict, list, set)) or k not in self.metadata:
|
||||||
self.set(k, v)
|
self.set(k, v)
|
||||||
else: # key conflict
|
else: # key conflict
|
||||||
if type(v) in [dict, set]: self.set(k, self.get(k) | v)
|
if isinstance(v, (dict, set)):
|
||||||
elif type(v) == list: self.set(k, self.get(k) + v)
|
self.set(k, self.get(k) | v)
|
||||||
|
elif type(v) is list:
|
||||||
|
self.set(k, self.get(k) + v)
|
||||||
self.media.extend(right.media)
|
self.media.extend(right.media)
|
||||||
|
|
||||||
else: # invert and do same logic
|
else: # invert and do same logic
|
||||||
return right.merge(self)
|
return right.merge(self)
|
||||||
return self
|
return self
|
||||||
@@ -80,24 +85,26 @@ class Metadata:
|
|||||||
return self.metadata.get(key, default)
|
return self.metadata.get(key, default)
|
||||||
|
|
||||||
def success(self, context: str = None) -> Metadata:
    """Mark this item as successful and return ``self`` for chaining.

    When ``context`` is truthy the status becomes ``"<context>: success"``;
    otherwise it is plain ``"success"``.
    """
    self.status = f"{context}: success" if context else "success"
    return self
|
||||||
|
|
||||||
def is_success(self) -> bool:
    """Return True when the status text contains the word ``"success"``."""
    current_status = self.status
    return "success" in current_status
|
||||||
|
|
||||||
def is_empty(self) -> bool:
    """Return True when nothing meaningful has been archived.

    An item is empty when it is not successful, has no media, and its
    metadata contains only bookkeeping keys.
    """
    # Metadata keys that do not count as meaningful content.
    bookkeeping_keys = {"_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"}
    if self.is_success():
        return False
    if len(self.media) != 0:
        return False
    return all(key in bookkeeping_keys for key in self.metadata)
|
||||||
|
|
||||||
@property  # getter .netloc
def netloc(self) -> str:
    """Network-location component of the URL returned by ``get_url()``."""
    parsed_url = urlparse(self.get_url())
    return parsed_url.netloc
|
||||||
|
|
||||||
|
# custom getter/setters
|
||||||
# custom getter/setters
|
|
||||||
|
|
||||||
|
|
||||||
def set_url(self, url: str) -> Metadata:
|
def set_url(self, url: str) -> Metadata:
|
||||||
assert type(url) is str and len(url) > 0, "invalid URL"
|
assert type(url) is str and len(url) > 0, "invalid URL"
|
||||||
@@ -120,36 +127,43 @@ class Metadata:
|
|||||||
return self.get("title")
|
return self.get("title")
|
||||||
|
|
||||||
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
    """Store ``timestamp`` under the ``"timestamp"`` key.

    A string is first parsed with ``parse_dt``; anything that is not a
    ``datetime.datetime`` after that step trips the assertion. Returns
    whatever ``self.set`` returns.
    """
    parsed = parse_dt(timestamp) if isinstance(timestamp, str) else timestamp
    assert isinstance(parsed, datetime.datetime), "set_timestamp expects a datetime instance"
    return self.set("timestamp", parsed)
|
||||||
|
|
||||||
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime | str | None:
    """Return the stored ``"timestamp"`` value, normalised.

    Args:
        utc: When true, the result is expressed in UTC. Naive values are
            taken to already be UTC and are only tagged; timezone-aware
            values are converted so the instant they denote is preserved.
        iso: When true, return an ISO-8601 string; otherwise return the
            ``datetime`` object.

    Returns:
        The timestamp as a string or ``datetime``, or ``None`` when no
        timestamp is stored or it cannot be parsed (the error is logged).
    """
    ts = self.get("timestamp")
    if not ts:
        return None
    try:
        if isinstance(ts, str):
            ts = datetime.datetime.fromisoformat(ts)
        elif isinstance(ts, float):
            # An epoch float denotes an absolute instant: build it directly in
            # UTC when UTC is requested, instead of converting to local time
            # and then relabelling that local wall-clock time as UTC.
            ts = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc if utc else None)
        if utc:
            if ts.tzinfo is None:
                # Naive value: assume it is already UTC and just tag it.
                ts = ts.replace(tzinfo=datetime.timezone.utc)
            else:
                # Aware value: convert rather than overwrite the offset.
                ts = ts.astimezone(datetime.timezone.utc)
        return ts.isoformat() if iso else ts
    except Exception as e:
        # Deliberate best-effort: an unparseable timestamp is reported, not raised.
        logger.error(f"Unable to parse timestamp {ts}: {e}")
        return None
|
||||||
|
|
||||||
def add_media(self, media: Media, id: str = None) -> Media | None:
    """Append ``media`` to this item's media list, optionally tagging it with ``id``.

    Args:
        media: The media to add. ``None`` is ignored.
        id: Optional identifier stored on the media under the ``"id"`` key.
            It must not already be used by another media of this item.

    Returns:
        The media that was added, or ``None`` when ``media`` was ``None``.
        (The annotation previously claimed ``Metadata``, which this function
        never returns.)

    Raises:
        AssertionError: if ``id`` is already used by an existing media.
    """
    if media is None:
        return None
    if id is not None:
        # any() stops at the first clash instead of building a throwaway list.
        assert not any(m.get("id") == id for m in self.media), (
            f"cannot add 2 pieces of media with the same id {id}"
        )
        media.set("id", id)
    self.media.append(media)
    return media
|
||||||
|
|
||||||
def get_media_by_id(self, id: str, default=None) -> Media:
    """Return the first media whose ``"id"`` equals ``id``, else ``default``."""
    matching = (candidate for candidate in self.media if candidate.get("id") == id)
    return next(matching, default)
|
||||||
|
|
||||||
def remove_duplicate_media_by_hash(self) -> None:
|
def remove_duplicate_media_by_hash(self) -> None:
|
||||||
@@ -159,7 +173,8 @@ class Metadata:
|
|||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
while True:
|
while True:
|
||||||
buf = f.read(chunksize)
|
buf = f.read(chunksize)
|
||||||
if not buf: break
|
if not buf:
|
||||||
|
break
|
||||||
hash_algo.update(buf)
|
hash_algo.update(buf)
|
||||||
return hash_algo.hexdigest()
|
return hash_algo.hexdigest()
|
||||||
|
|
||||||
@@ -167,15 +182,18 @@ class Metadata:
|
|||||||
new_media = []
|
new_media = []
|
||||||
for m in self.media:
|
for m in self.media:
|
||||||
h = m.get("hash")
|
h = m.get("hash")
|
||||||
if not h: h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
if not h:
|
||||||
if len(h) and h in media_hashes: continue
|
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||||
|
if len(h) and h in media_hashes:
|
||||||
|
continue
|
||||||
media_hashes.add(h)
|
media_hashes.add(h)
|
||||||
new_media.append(m)
|
new_media.append(m)
|
||||||
self.media = new_media
|
self.media = new_media
|
||||||
|
|
||||||
def get_first_image(self, default=None) -> Media:
    """Return the first media whose mimetype contains ``"image"``, else ``default``."""
    images = (candidate for candidate in self.media if "image" in candidate.mimetype)
    return next(images, default)
|
||||||
|
|
||||||
def set_final_media(self, final: Media) -> Metadata:
|
def set_final_media(self, final: Media) -> Metadata:
|
||||||
@@ -193,17 +211,20 @@ class Metadata:
|
|||||||
def __str__(self) -> str:
    """Return the same text as ``__repr__``."""
    return self.__repr__()
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
def choose_most_complete(results: List[Metadata]) -> Metadata:
    """Pick the most complete result from ``results``.

    Results are ranked by number of media first, then by number of metadata
    entries; on a full tie the earliest result wins. Returns ``None`` for an
    empty list.
    """
    if len(results) == 0:
        return None

    def completeness(result):
        # More media beats more metadata; tuples compare in that order.
        return (len(result.media), len(result.metadata))

    # max() returns the first maximal element, so earlier results win ties.
    return max(results, key=completeness)
|
||||||
|
|
||||||
def set_context(self, key: str, val: Any) -> Metadata:
|
def set_context(self, key: str, val: Any) -> Metadata:
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ Defines the Step abstract base class, which acts as a blueprint for steps in the
|
|||||||
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -24,8 +25,8 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
HAS_SETUP_PATHS = False
|
HAS_SETUP_PATHS = False
|
||||||
|
|
||||||
class ModuleFactory:
|
|
||||||
|
|
||||||
|
class ModuleFactory:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._lazy_modules = {}
|
self._lazy_modules = {}
|
||||||
|
|
||||||
@@ -46,11 +47,13 @@ class ModuleFactory:
|
|||||||
|
|
||||||
# see odoo/module/module.py -> initialize_sys_path
|
# see odoo/module/module.py -> initialize_sys_path
|
||||||
if path not in auto_archiver.modules.__path__:
|
if path not in auto_archiver.modules.__path__:
|
||||||
if HAS_SETUP_PATHS == True:
|
if HAS_SETUP_PATHS:
|
||||||
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
logger.warning(
|
||||||
|
f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||||
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
||||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
|
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing)."
|
||||||
auto_archiver.modules.__path__.append(path)
|
)
|
||||||
|
auto_archiver.modules.__path__.append(path)
|
||||||
|
|
||||||
# sort based on the length of the path, so that the longest path is last in the list
|
# sort based on the length of the path, so that the longest path is last in the list
|
||||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||||
@@ -81,13 +84,14 @@ class ModuleFactory:
|
|||||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||||
if not available:
|
if not available:
|
||||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||||
if 'archiver' in module_name:
|
if "archiver" in module_name:
|
||||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||||
raise IndexError(message)
|
raise IndexError(message)
|
||||||
return available[0]
|
return available[0]
|
||||||
|
|
||||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
def available_modules(
|
||||||
|
self, limit_to_modules: List[str] = [], suppress_warnings: bool = False
|
||||||
|
) -> List[LazyBaseModule]:
|
||||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||||
|
|
||||||
# see odoo/modules/module.py -> get_modules
|
# see odoo/modules/module.py -> get_modules
|
||||||
@@ -127,15 +131,16 @@ class ModuleFactory:
|
|||||||
|
|
||||||
return all_modules
|
return all_modules
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class LazyBaseModule:
|
class LazyBaseModule:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
A lazy module class, which only loads the manifest and does not load the module itself.
|
A lazy module class, which only loads the manifest and does not load the module itself.
|
||||||
|
|
||||||
This is useful for getting information about a module without actually loading it.
|
This is useful for getting information about a module without actually loading it.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name: str
|
name: str
|
||||||
description: str
|
description: str
|
||||||
path: str
|
path: str
|
||||||
@@ -152,30 +157,30 @@ class LazyBaseModule:
|
|||||||
|
|
||||||
@property
def type(self):
    """The manifest's ``"type"`` entry."""
    manifest = self.manifest
    return manifest["type"]
|
||||||
|
|
||||||
@property
def entry_point(self):
    """Entry point of the module, in ``file_name::ClassName`` form.

    If no entry point has been set on the instance and the manifest's
    ``"entry_point"`` is empty, one is derived from the module name by
    title-casing its underscore-separated words.
    """
    if self._entry_point:
        return self._entry_point
    if not self.manifest["entry_point"]:
        # try to create the entry point from the module name
        self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
    return self._entry_point
|
||||||
|
|
||||||
@property
def dependencies(self) -> dict:
    """The manifest's ``"dependencies"`` entry."""
    manifest = self.manifest
    return manifest["dependencies"]
|
||||||
|
|
||||||
@property
def configs(self) -> dict:
    """The manifest's ``"configs"`` entry."""
    manifest = self.manifest
    return manifest["configs"]
|
||||||
|
|
||||||
@property
def requires_setup(self) -> bool:
    """The manifest's ``"requires_setup"`` entry."""
    manifest = self.manifest
    return manifest["requires_setup"]
|
||||||
|
|
||||||
@property
def display_name(self) -> str:
    """The manifest's ``"name"`` entry."""
    manifest = self.manifest
    return manifest["name"]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def manifest(self) -> dict:
|
def manifest(self) -> dict:
|
||||||
@@ -189,17 +194,16 @@ class LazyBaseModule:
|
|||||||
try:
|
try:
|
||||||
manifest.update(ast.literal_eval(f.read()))
|
manifest.update(ast.literal_eval(f.read()))
|
||||||
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
||||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") from e
|
||||||
|
|
||||||
self._manifest = manifest
|
self._manifest = manifest
|
||||||
self._entry_point = manifest['entry_point']
|
self._entry_point = manifest["entry_point"]
|
||||||
self.description = manifest['description']
|
self.description = manifest["description"]
|
||||||
self.version = manifest['version']
|
self.version = manifest["version"]
|
||||||
|
|
||||||
return manifest
|
return manifest
|
||||||
|
|
||||||
def load(self, config) -> BaseModule:
|
def load(self, config) -> BaseModule:
|
||||||
|
|
||||||
if self._instance:
|
if self._instance:
|
||||||
return self._instance
|
return self._instance
|
||||||
|
|
||||||
@@ -210,8 +214,10 @@ class LazyBaseModule:
|
|||||||
# clear out any empty strings that a user may have erroneously added
|
# clear out any empty strings that a user may have erroneously added
|
||||||
continue
|
continue
|
||||||
if not check(dep):
|
if not check(dep):
|
||||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
logger.error(
|
||||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||||
|
Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
|
||||||
|
)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
def check_python_dep(dep):
|
def check_python_dep(dep):
|
||||||
@@ -219,10 +225,10 @@ class LazyBaseModule:
|
|||||||
try:
|
try:
|
||||||
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
||||||
try:
|
try:
|
||||||
# we must now load this module and set it up with the config
|
# we must now load this module and set it up with the config
|
||||||
m.load(config)
|
m.load(config)
|
||||||
return True
|
return True
|
||||||
except:
|
except Exception:
|
||||||
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
|
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
|
||||||
return False
|
return False
|
||||||
except IndexError:
|
except IndexError:
|
||||||
@@ -231,13 +237,12 @@ class LazyBaseModule:
|
|||||||
|
|
||||||
return find_spec(dep)
|
return find_spec(dep)
|
||||||
|
|
||||||
check_deps(self.dependencies.get('python', []), check_python_dep)
|
check_deps(self.dependencies.get("python", []), check_python_dep)
|
||||||
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep))
|
||||||
|
|
||||||
|
|
||||||
logger.debug(f"Loading module '{self.display_name}'...")
|
logger.debug(f"Loading module '{self.display_name}'...")
|
||||||
|
|
||||||
for qualname in [self.name, f'auto_archiver.modules.{self.name}']:
|
for qualname in [self.name, f"auto_archiver.modules.{self.name}"]:
|
||||||
try:
|
try:
|
||||||
# first import the whole module, to make sure it's working properly
|
# first import the whole module, to make sure it's working properly
|
||||||
__import__(qualname)
|
__import__(qualname)
|
||||||
@@ -246,10 +251,10 @@ class LazyBaseModule:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# then import the file for the entry point
|
# then import the file for the entry point
|
||||||
file_name, class_name = self.entry_point.split('::')
|
file_name, class_name = self.entry_point.split("::")
|
||||||
sub_qualname = f'{qualname}.{file_name}'
|
sub_qualname = f"{qualname}.{file_name}"
|
||||||
|
|
||||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
__import__(f"{qualname}.{file_name}", fromlist=[self.entry_point])
|
||||||
# finally, get the class instance
|
# finally, get the class instance
|
||||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||||
|
|
||||||
@@ -259,9 +264,9 @@ class LazyBaseModule:
|
|||||||
instance.module_factory = self.module_factory
|
instance.module_factory = self.module_factory
|
||||||
|
|
||||||
# merge the default config with the user config
|
# merge the default config with the user config
|
||||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
|
default_config = dict((k, v["default"]) for k, v in self.configs.items() if "default" in v)
|
||||||
|
|
||||||
config[self.name] = default_config | config.get(self.name, {})
|
config[self.name] = default_config | config.get(self.name, {})
|
||||||
instance.config_setup(config)
|
instance.config_setup(config)
|
||||||
instance.setup()
|
instance.setup()
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
""" Orchestrates all archiving steps, including feeding items,
|
"""Orchestrates all archiving steps, including feeding items,
|
||||||
archiving them with specific archivers, enrichment, storage,
|
archiving them with specific archivers, enrichment, storage,
|
||||||
formatting, database operations and clean up.
|
formatting, database operations and clean up.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -19,8 +19,17 @@ import requests
|
|||||||
|
|
||||||
from .metadata import Metadata, Media
|
from .metadata import Metadata, Media
|
||||||
from auto_archiver.version import __version__
|
from auto_archiver.version import __version__
|
||||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
from .config import (
|
||||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
read_yaml,
|
||||||
|
store_yaml,
|
||||||
|
to_dot_notation,
|
||||||
|
merge_dicts,
|
||||||
|
is_valid_config,
|
||||||
|
DefaultValidatingParser,
|
||||||
|
UniqueAppendAction,
|
||||||
|
AuthenticationJsonParseAction,
|
||||||
|
DEFAULT_CONFIG_FILE,
|
||||||
|
)
|
||||||
from .module import ModuleFactory, LazyBaseModule
|
from .module import ModuleFactory, LazyBaseModule
|
||||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||||
from .consts import MODULE_TYPES, SetupError
|
from .consts import MODULE_TYPES, SetupError
|
||||||
@@ -30,8 +39,8 @@ if TYPE_CHECKING:
|
|||||||
from .base_module import BaseModule
|
from .base_module import BaseModule
|
||||||
from .module import LazyBaseModule
|
from .module import LazyBaseModule
|
||||||
|
|
||||||
class ArchivingOrchestrator:
|
|
||||||
|
|
||||||
|
class ArchivingOrchestrator:
|
||||||
# instance variables
|
# instance variables
|
||||||
module_factory: ModuleFactory
|
module_factory: ModuleFactory
|
||||||
setup_finished: bool
|
setup_finished: bool
|
||||||
@@ -61,30 +70,63 @@ class ArchivingOrchestrator:
|
|||||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||||
formatter_class=RichHelpFormatter,
|
formatter_class=RichHelpFormatter,
|
||||||
)
|
)
|
||||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||||
parser.add_argument('--version', action='version', version=__version__)
|
parser.add_argument("--version", action="version", version=__version__)
|
||||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
parser.add_argument(
|
||||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
"--config",
|
||||||
|
action="store",
|
||||||
|
dest="config_file",
|
||||||
|
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||||
|
default=DEFAULT_CONFIG_FILE,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--mode",
|
||||||
|
action="store",
|
||||||
|
dest="mode",
|
||||||
|
type=str,
|
||||||
|
choices=["simple", "full"],
|
||||||
|
help="the mode to run the archiver in",
|
||||||
|
default="simple",
|
||||||
|
)
|
||||||
# override the default 'help' so we can inject all the configs and show those
|
# override the default 'help' so we can inject all the configs and show those
|
||||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
parser.add_argument(
|
||||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
"-s",
|
||||||
|
"--store",
|
||||||
|
dest="store",
|
||||||
|
default=False,
|
||||||
|
help="Store the created config in the config file",
|
||||||
|
action=argparse.BooleanOptionalAction,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--module_paths",
|
||||||
|
dest="module_paths",
|
||||||
|
nargs="+",
|
||||||
|
default=[],
|
||||||
|
help="additional paths to search for modules",
|
||||||
|
action=UniqueAppendAction,
|
||||||
|
)
|
||||||
|
|
||||||
self.basic_parser = parser
|
self.basic_parser = parser
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def check_steps(self, config):
|
def check_steps(self, config):
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
if not config['steps'].get(f"{module_type}s", []):
|
if not config["steps"].get(f"{module_type}s", []):
|
||||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
raise SetupError(
|
||||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||||
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
)
|
||||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
raise SetupError(
|
||||||
|
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||||
|
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||||
|
)
|
||||||
|
raise SetupError(
|
||||||
|
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||||
|
)
|
||||||
|
|
||||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||||
|
|
||||||
# modules parser to get the overridden 'steps' values
|
# modules parser to get the overridden 'steps' values
|
||||||
modules_parser = argparse.ArgumentParser(
|
modules_parser = argparse.ArgumentParser(
|
||||||
add_help=False,
|
add_help=False,
|
||||||
@@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
self.add_modules_args(modules_parser)
|
self.add_modules_args(modules_parser)
|
||||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||||
|
"steps"
|
||||||
|
].get(f"{module_type}s", [])
|
||||||
|
|
||||||
parser = DefaultValidatingParser(
|
parser = DefaultValidatingParser(
|
||||||
add_help=False,
|
add_help=False,
|
||||||
@@ -115,27 +159,29 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
enabled_modules = []
|
enabled_modules = []
|
||||||
# first loads the modules from the config file, then from the command line
|
# first loads the modules from the config file, then from the command line
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||||
|
|
||||||
# clear out duplicates, but keep the order
|
# clear out duplicates, but keep the order
|
||||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
avail_modules = self.module_factory.available_modules(
|
||||||
|
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||||
|
)
|
||||||
self.add_individual_module_args(avail_modules, parser)
|
self.add_individual_module_args(avail_modules, parser)
|
||||||
elif basic_config.mode == 'simple':
|
elif basic_config.mode == "simple":
|
||||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||||
self.add_individual_module_args(simple_modules, parser)
|
self.add_individual_module_args(simple_modules, parser)
|
||||||
|
|
||||||
# add them to the config
|
# add them to the config
|
||||||
for module in simple_modules:
|
for module in simple_modules:
|
||||||
for module_type in module.type:
|
for module_type in module.type:
|
||||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||||
else:
|
else:
|
||||||
# load all modules, they're not using the 'simple' mode
|
# load all modules, they're not using the 'simple' mode
|
||||||
all_modules = self.module_factory.available_modules()
|
all_modules = self.module_factory.available_modules()
|
||||||
# add all the modules to the steps
|
# add all the modules to the steps
|
||||||
for module in all_modules:
|
for module in all_modules:
|
||||||
for module_type in module.type:
|
for module_type in module.type:
|
||||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||||
|
|
||||||
self.add_individual_module_args(all_modules, parser)
|
self.add_individual_module_args(all_modules, parser)
|
||||||
|
|
||||||
@@ -171,34 +217,67 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
# Module loading from the command line
|
# Module loading from the command line
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
parser.add_argument(
|
||||||
|
f"--{module_type}s",
|
||||||
|
dest=f"{module_type}s",
|
||||||
|
nargs="+",
|
||||||
|
help=f"the {module_type}s to use",
|
||||||
|
default=[],
|
||||||
|
action=UniqueAppendAction,
|
||||||
|
)
|
||||||
|
|
||||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||||
if not parser:
|
if not parser:
|
||||||
parser = self.parser
|
parser = self.parser
|
||||||
|
|
||||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
parser.add_argument(
|
||||||
|
"--authentication",
|
||||||
|
dest="authentication",
|
||||||
|
help="A dictionary of sites and their authentication methods \
|
||||||
(token, username etc.) that extractors can use to log into \
|
(token, username etc.) that extractors can use to log into \
|
||||||
a website. If passing this on the command line, use a JSON string. \
|
a website. If passing this on the command line, use a JSON string. \
|
||||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||||
default={},
|
default={},
|
||||||
nargs="?",
|
nargs="?",
|
||||||
action=AuthenticationJsonParseAction)
|
action=AuthenticationJsonParseAction,
|
||||||
|
)
|
||||||
|
|
||||||
# logging arguments
|
# logging arguments
|
||||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
parser.add_argument(
|
||||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
"--logging.level",
|
||||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
action="store",
|
||||||
|
dest="logging.level",
|
||||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||||
|
help="the logging level to use",
|
||||||
|
default="INFO",
|
||||||
|
type=str.upper,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging.rotation",
|
||||||
|
action="store",
|
||||||
|
dest="logging.rotation",
|
||||||
|
help="the logging rotation to use",
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
def add_individual_module_args(
|
||||||
|
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||||
|
) -> None:
|
||||||
if not modules:
|
if not modules:
|
||||||
modules = self.module_factory.available_modules()
|
modules = self.module_factory.available_modules()
|
||||||
|
|
||||||
for module in modules:
|
for module in modules:
|
||||||
if module.name == 'cli_feeder':
|
if module.name == "cli_feeder":
|
||||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
parser.add_argument(
|
||||||
|
"urls",
|
||||||
|
nargs="*",
|
||||||
|
default=[],
|
||||||
|
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not module.configs:
|
if not module.configs:
|
||||||
@@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||||
|
|
||||||
for name, kwargs in module.configs.items():
|
for name, kwargs in module.configs.items():
|
||||||
if not kwargs.get('metavar', None):
|
if not kwargs.get("metavar", None):
|
||||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||||
kwargs['metavar'] = name.upper()
|
kwargs["metavar"] = name.upper()
|
||||||
|
|
||||||
if kwargs.get('required', False):
|
if kwargs.get("required", False):
|
||||||
# required args shouldn't have a 'default' value, remove it
|
# required args shouldn't have a 'default' value, remove it
|
||||||
kwargs.pop('default', None)
|
kwargs.pop("default", None)
|
||||||
|
|
||||||
kwargs.pop('cli_set', None)
|
kwargs.pop("cli_set", None)
|
||||||
should_store = kwargs.pop('should_store', False)
|
should_store = kwargs.pop("should_store", False)
|
||||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||||
try:
|
try:
|
||||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||||
arg.should_store = should_store
|
arg.should_store = should_store
|
||||||
|
|
||||||
@@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
self.basic_parser.exit()
|
self.basic_parser.exit()
|
||||||
|
|
||||||
def setup_logging(self, config):
|
def setup_logging(self, config):
|
||||||
|
logging_config = config["logging"]
|
||||||
|
|
||||||
logging_config = config['logging']
|
if logging_config.get("enabled", True) is False:
|
||||||
|
|
||||||
if logging_config.get('enabled', True) is False:
|
|
||||||
# disabled logging settings, they're set on a higher level
|
# disabled logging settings, they're set on a higher level
|
||||||
logger.disable('auto_archiver')
|
logger.disable("auto_archiver")
|
||||||
return
|
return
|
||||||
|
|
||||||
# setup loguru logging
|
# setup loguru logging
|
||||||
@@ -253,10 +331,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# add other logging info
|
# add other logging info
|
||||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||||
if log_file := logging_config['file']:
|
if log_file := logging_config["file"]:
|
||||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||||
|
log_file, rotation=logging_config["rotation"]
|
||||||
|
)
|
||||||
|
|
||||||
def install_modules(self, modules_by_type):
|
def install_modules(self, modules_by_type):
|
||||||
"""
|
"""
|
||||||
@@ -267,24 +347,29 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
invalid_modules = []
|
invalid_modules = []
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
|
|
||||||
step_items = []
|
step_items = []
|
||||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||||
if not modules_to_load:
|
if not modules_to_load:
|
||||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
raise SetupError(
|
||||||
|
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||||
|
)
|
||||||
|
|
||||||
def check_steps_ok():
|
def check_steps_ok():
|
||||||
if not len(step_items):
|
if not len(step_items):
|
||||||
if len(modules_to_load):
|
if len(modules_to_load):
|
||||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
logger.error(
|
||||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||||
|
)
|
||||||
|
raise SetupError(
|
||||||
|
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||||
|
)
|
||||||
|
|
||||||
|
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
raise SetupError(
|
||||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||||
|
)
|
||||||
|
|
||||||
for module in modules_to_load:
|
for module in modules_to_load:
|
||||||
|
|
||||||
if module in invalid_modules:
|
if module in invalid_modules:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||||
except (KeyboardInterrupt, Exception) as e:
|
except (KeyboardInterrupt, Exception) as e:
|
||||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||||
if loaded_module and module_type == 'extractor':
|
if loaded_module and module_type == "extractor":
|
||||||
loaded_module.cleanup()
|
loaded_module.cleanup()
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
@@ -308,7 +393,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
def load_config(self, config_file: str) -> dict:
|
def load_config(self, config_file: str) -> dict:
|
||||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
logger.error(
|
||||||
|
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||||
|
)
|
||||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||||
|
|
||||||
return read_yaml(config_file)
|
return read_yaml(config_file)
|
||||||
@@ -338,10 +425,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
def check_for_updates(self):
|
def check_for_updates(self):
|
||||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||||
latest_version = response['info']['version']
|
latest_version = response["info"]["version"]
|
||||||
# check version compared to current version
|
# check version compared to current version
|
||||||
if latest_version != __version__:
|
if latest_version != __version__:
|
||||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||||
else:
|
else:
|
||||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||||
@@ -351,7 +438,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||||
logger.warning("")
|
logger.warning("")
|
||||||
|
|
||||||
|
|
||||||
def setup(self, args: list):
|
def setup(self, args: list):
|
||||||
"""
|
"""
|
||||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||||
@@ -362,21 +448,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
self.check_for_updates()
|
self.check_for_updates()
|
||||||
|
|
||||||
if self.setup_finished:
|
if self.setup_finished:
|
||||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
logger.warning(
|
||||||
|
"The `setup_config()` function should only ever be run once. \
|
||||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||||
multiple times to archive multiple URLs.")
|
multiple times to archive multiple URLs."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
self.setup_basic_parser()
|
self.setup_basic_parser()
|
||||||
self.config = self.setup_config(args)
|
self.config = self.setup_config(args)
|
||||||
|
|
||||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||||
self.install_modules(self.config['steps'])
|
self.install_modules(self.config["steps"])
|
||||||
|
|
||||||
# log out the modules that were loaded
|
# log out the modules that were loaded
|
||||||
for module_type in MODULE_TYPES:
|
for module_type in MODULE_TYPES:
|
||||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
logger.info(
|
||||||
|
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||||
|
)
|
||||||
|
|
||||||
self.setup_finished = True
|
self.setup_finished = True
|
||||||
|
|
||||||
@@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
e.cleanup()
|
e.cleanup()
|
||||||
|
|
||||||
def feed(self) -> Generator[Metadata]:
|
def feed(self) -> Generator[Metadata]:
|
||||||
|
|
||||||
url_count = 0
|
url_count = 0
|
||||||
for feeder in self.feeders:
|
for feeder in self.feeders:
|
||||||
for item in feeder:
|
for item in feeder:
|
||||||
@@ -436,9 +525,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
self.cleanup()
|
self.cleanup()
|
||||||
exit()
|
exit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
if type(e) == AssertionError:
|
if isinstance(e, AssertionError):
|
||||||
d.failed(item, str(e))
|
d.failed(item, str(e))
|
||||||
else:
|
else:
|
||||||
d.failed(item, reason="unexpected error")
|
d.failed(item, reason="unexpected error")
|
||||||
@@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||||
"""
|
"""
|
||||||
Runs the archiving process for a single URL
|
Runs the archiving process for a single URL
|
||||||
1. Each archiver can sanitize its own URLs
|
1. Each archiver can sanitize its own URLs
|
||||||
2. Check for cached results in Databases, and signal start to the databases
|
2. Check for cached results in Databases, and signal start to the databases
|
||||||
3. Call Archivers until one succeeds
|
3. Call Archivers until one succeeds
|
||||||
4. Call Enrichers
|
4. Call Enrichers
|
||||||
5. Store all downloaded/generated media
|
5. Store all downloaded/generated media
|
||||||
6. Call selected Formatter and store formatted if needed
|
6. Call selected Formatter and store formatted if needed
|
||||||
"""
|
"""
|
||||||
|
|
||||||
original_url = result.get_url().strip()
|
original_url = result.get_url().strip()
|
||||||
@@ -473,7 +562,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
url = a.sanitize_url(url)
|
url = a.sanitize_url(url)
|
||||||
|
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
if original_url != url: result.set("original_url", original_url)
|
if original_url != url:
|
||||||
|
result.set("original_url", original_url)
|
||||||
|
|
||||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||||
cached_result = None
|
cached_result = None
|
||||||
@@ -484,7 +574,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
if cached_result:
|
if cached_result:
|
||||||
logger.debug("Found previously archived entry")
|
logger.debug("Found previously archived entry")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
try: d.done(cached_result, cached=True)
|
try:
|
||||||
|
d.done(cached_result, cached=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
return cached_result
|
return cached_result
|
||||||
@@ -494,13 +585,15 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
logger.info(f"Trying extractor {a.name} for {url}")
|
logger.info(f"Trying extractor {a.name} for {url}")
|
||||||
try:
|
try:
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success(): break
|
if result.is_success():
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 4 - call enrichers to work with archived content
|
# 4 - call enrichers to work with archived content
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try: e.enrich(result)
|
try:
|
||||||
|
e.enrich(result)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
@@ -518,13 +611,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
# signal completion to databases and archivers
|
# signal completion to databases and archivers
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
try: d.done(result)
|
try:
|
||||||
|
d.done(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def setup_authentication(self, config: dict) -> dict:
|
def setup_authentication(self, config: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Setup authentication for all modules that require it
|
Setup authentication for all modules that require it
|
||||||
@@ -532,7 +625,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
Split up strings into multiple sites if they are comma separated
|
Split up strings into multiple sites if they are comma separated
|
||||||
"""
|
"""
|
||||||
|
|
||||||
authentication = config.get('authentication', {})
|
authentication = config.get("authentication", {})
|
||||||
|
|
||||||
# extract out concatenated sites
|
# extract out concatenated sites
|
||||||
for key, val in copy(authentication).items():
|
for key, val in copy(authentication).items():
|
||||||
@@ -542,7 +635,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
authentication[site] = val
|
authentication[site] = val
|
||||||
del authentication[key]
|
del authentication[key]
|
||||||
|
|
||||||
config['authentication'] = authentication
|
config["authentication"] = authentication
|
||||||
return config
|
return config
|
||||||
|
|
||||||
# Helper Properties
|
# Helper Properties
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ from __future__ import annotations
|
|||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import IO
|
from typing import IO
|
||||||
import os
|
import os
|
||||||
import platform
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
@@ -33,15 +32,15 @@ from auto_archiver.utils.misc import random_str
|
|||||||
from auto_archiver.core import Media, BaseModule, Metadata
|
from auto_archiver.core import Media, BaseModule, Metadata
|
||||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||||
|
|
||||||
class Storage(BaseModule):
|
|
||||||
|
|
||||||
|
class Storage(BaseModule):
|
||||||
"""
|
"""
|
||||||
Base class for implementing storage modules in the media archiving framework.
|
Base class for implementing storage modules in the media archiving framework.
|
||||||
|
|
||||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||||
if media.is_stored(in_storage=self):
|
if media.is_stored(in_storage=self):
|
||||||
logger.debug(f"{media.key} already stored, skipping")
|
logger.debug(f"{media.key} already stored, skipping")
|
||||||
return
|
return
|
||||||
@@ -74,8 +73,8 @@ class Storage(BaseModule):
|
|||||||
This method should not be called directly, but instead be called through the 'store' method,
|
This method should not be called directly, but instead be called through the 'store' method,
|
||||||
which sets up the media for storage.
|
which sets up the media for storage.
|
||||||
"""
|
"""
|
||||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||||
with open(media.filename, 'rb') as f:
|
with open(media.filename, "rb") as f:
|
||||||
return self.uploadf(f, media, **kwargs)
|
return self.uploadf(f, media, **kwargs)
|
||||||
|
|
||||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||||
@@ -85,7 +84,7 @@ class Storage(BaseModule):
|
|||||||
# media key is already set
|
# media key is already set
|
||||||
return
|
return
|
||||||
|
|
||||||
folder = metadata.get_context('folder', '')
|
folder = metadata.get_context("folder", "")
|
||||||
filename, ext = os.path.splitext(media.filename)
|
filename, ext = os.path.splitext(media.filename)
|
||||||
|
|
||||||
# Handle path_generator logic
|
# Handle path_generator logic
|
||||||
@@ -105,12 +104,11 @@ class Storage(BaseModule):
|
|||||||
filename = random_str(24)
|
filename = random_str(24)
|
||||||
elif filename_generator == "static":
|
elif filename_generator == "static":
|
||||||
# load the hash_enricher module
|
# load the hash_enricher module
|
||||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
|
||||||
hd = he.calculate_hash(media.filename)
|
hd = he.calculate_hash(media.filename)
|
||||||
filename = hd[:24]
|
filename = hd[:24]
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||||
|
|
||||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||||
|
|
||||||
media._key = key
|
media._key = key
|
||||||
@@ -3,11 +3,13 @@ from pathlib import Path
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
def example_validator(value):
|
def example_validator(value):
|
||||||
if "example" not in value:
|
if "example" not in value:
|
||||||
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def positive_number(value):
|
def positive_number(value):
|
||||||
if value < 0:
|
if value < 0:
|
||||||
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
||||||
@@ -19,5 +21,6 @@ def valid_file(value):
|
|||||||
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def json_loader(cli_val):
|
def json_loader(cli_val):
|
||||||
return json.loads(cli_val)
|
return json.loads(cli_val)
|
||||||
@@ -11,8 +11,7 @@
|
|||||||
"required": True,
|
"required": True,
|
||||||
"help": "API endpoint where calls are made to",
|
"help": "API endpoint where calls are made to",
|
||||||
},
|
},
|
||||||
"api_token": {"default": None,
|
"api_token": {"default": None, "help": "API Bearer token."},
|
||||||
"help": "API Bearer token."},
|
|
||||||
"public": {
|
"public": {
|
||||||
"default": False,
|
"default": False,
|
||||||
"type": "bool",
|
"type": "bool",
|
||||||
|
|||||||
@@ -12,10 +12,11 @@ class AAApiDb(Database):
|
|||||||
"""Connects to auto-archiver-api instance"""
|
"""Connects to auto-archiver-api instance"""
|
||||||
|
|
||||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||||
""" query the database for the existence of this item.
|
"""query the database for the existence of this item.
|
||||||
Helps avoid re-archiving the same URL multiple times.
|
Helps avoid re-archiving the same URL multiple times.
|
||||||
"""
|
"""
|
||||||
if not self.use_api_cache: return
|
if not self.use_api_cache:
|
||||||
|
return
|
||||||
|
|
||||||
params = {"url": item.get_url(), "limit": 15}
|
params = {"url": item.get_url(), "limit": 15}
|
||||||
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
||||||
@@ -32,22 +33,25 @@ class AAApiDb(Database):
|
|||||||
|
|
||||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||||
"""archival result ready - should be saved to DB"""
|
"""archival result ready - should be saved to DB"""
|
||||||
if not self.store_results: return
|
if not self.store_results:
|
||||||
|
return
|
||||||
if cached:
|
if cached:
|
||||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||||
return
|
return
|
||||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'author_id': self.author_id,
|
"author_id": self.author_id,
|
||||||
'url': item.get_url(),
|
"url": item.get_url(),
|
||||||
'public': self.public,
|
"public": self.public,
|
||||||
'group_id': self.group_id,
|
"group_id": self.group_id,
|
||||||
'tags': list(self.tags),
|
"tags": list(self.tags),
|
||||||
'result': item.to_json(),
|
"result": item.to_json(),
|
||||||
}
|
}
|
||||||
headers = {"Authorization": f"Bearer {self.api_token}"}
|
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||||
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
|
response = requests.post(
|
||||||
|
os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
if response.status_code == 201:
|
if response.status_code == 201:
|
||||||
logger.success(f"AA API: {response.json()}")
|
logger.success(f"AA API: {response.json()}")
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"name": "Atlos Feeder Database Storage",
|
"name": "Atlos Feeder Database Storage",
|
||||||
"type": ["feeder", "database", "storage"],
|
"type": ["feeder", "database", "storage"],
|
||||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["loguru", "requests"],
|
"python": ["loguru", "requests"],
|
||||||
@@ -15,7 +15,7 @@
|
|||||||
"atlos_url": {
|
"atlos_url": {
|
||||||
"default": "https://platform.atlos.org",
|
"default": "https://platform.atlos.org",
|
||||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||||
"type": "str"
|
"type": "str",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
@@ -42,5 +42,5 @@
|
|||||||
- Requires an Atlos account with a project and a valid API token for authentication.
|
- Requires an Atlos account with a project and a valid API token for authentication.
|
||||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||||
- Feches any media items within an Atlos project, regardless of separation into incidents.
|
- Feches any media items within an Atlos project, regardless of separation into incidents.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ from auto_archiver.utils import calculate_file_hash
|
|||||||
|
|
||||||
|
|
||||||
class AtlosFeederDbStorage(Feeder, Database, Storage):
|
class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||||
|
|
||||||
def setup(self) -> requests.Session:
|
def setup(self) -> requests.Session:
|
||||||
"""create and return a persistent session."""
|
"""create and return a persistent session."""
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
@@ -18,9 +17,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
def _get(self, endpoint: str, params: Optional[dict] = None) -> dict:
|
def _get(self, endpoint: str, params: Optional[dict] = None) -> dict:
|
||||||
"""Wrapper for GET requests to the Atlos API."""
|
"""Wrapper for GET requests to the Atlos API."""
|
||||||
url = f"{self.atlos_url}{endpoint}"
|
url = f"{self.atlos_url}{endpoint}"
|
||||||
response = self.session.get(
|
response = self.session.get(url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params)
|
||||||
url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
@@ -85,10 +82,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
def _process_metadata(self, item: Metadata) -> dict:
|
def _process_metadata(self, item: Metadata) -> dict:
|
||||||
"""Process metadata for storage on Atlos. Will convert any datetime
|
"""Process metadata for storage on Atlos. Will convert any datetime
|
||||||
objects to ISO format."""
|
objects to ISO format."""
|
||||||
return {
|
return {k: v.isoformat() if hasattr(v, "isoformat") else v for k, v in item.metadata.items()}
|
||||||
k: v.isoformat() if hasattr(v, "isoformat") else v
|
|
||||||
for k, v in item.metadata.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||||
"""Mark an item as successfully archived in Atlos."""
|
"""Mark an item as successfully archived in Atlos."""
|
||||||
@@ -129,10 +123,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
|
|
||||||
# Check whether the media has already been uploaded
|
# Check whether the media has already been uploaded
|
||||||
source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"]
|
source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"]
|
||||||
existing_media = [
|
existing_media = [artifact.get("file_hash_sha256") for artifact in source_material.get("artifacts", [])]
|
||||||
artifact.get("file_hash_sha256")
|
|
||||||
for artifact in source_material.get("artifacts", [])
|
|
||||||
]
|
|
||||||
if media_hash in existing_media:
|
if media_hash in existing_media:
|
||||||
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
||||||
return True
|
return True
|
||||||
@@ -150,4 +141,3 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||||
"""Upload a file-like object; not implemented."""
|
"""Upload a file-like object; not implemented."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,15 @@
|
|||||||
{
|
{
|
||||||
'name': 'Command Line Feeder',
|
"name": "Command Line Feeder",
|
||||||
'type': ['feeder'],
|
"type": ["feeder"],
|
||||||
'entry_point': 'cli_feeder::CLIFeeder',
|
"entry_point": "cli_feeder::CLIFeeder",
|
||||||
'requires_setup': False,
|
"requires_setup": False,
|
||||||
'description': 'Feeds URLs to orchestrator from the command line',
|
"configs": {
|
||||||
'configs': {
|
"urls": {
|
||||||
'urls': {
|
"default": None,
|
||||||
'default': None,
|
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||||
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'description': """
|
"description": """
|
||||||
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
||||||
without the need to specify any additional configuration or command line arguments:
|
without the need to specify any additional configuration or command line arguments:
|
||||||
|
|
||||||
|
|||||||
@@ -3,15 +3,17 @@ from loguru import logger
|
|||||||
from auto_archiver.core.feeder import Feeder
|
from auto_archiver.core.feeder import Feeder
|
||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
|
|
||||||
class CLIFeeder(Feeder):
|
|
||||||
|
|
||||||
|
class CLIFeeder(Feeder):
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.urls = self.config['urls']
|
self.urls = self.config["urls"]
|
||||||
if not self.urls:
|
if not self.urls:
|
||||||
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
raise ValueError(
|
||||||
|
"No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information."
|
||||||
|
)
|
||||||
|
|
||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
urls = self.config['urls']
|
urls = self.config["urls"]
|
||||||
for url in urls:
|
for url in urls:
|
||||||
logger.debug(f"Processing {url}")
|
logger.debug(f"Processing {url}")
|
||||||
m = Metadata().set_url(url)
|
m = Metadata().set_url(url)
|
||||||
|
|||||||
@@ -6,18 +6,18 @@ from auto_archiver.core import Metadata
|
|||||||
|
|
||||||
class ConsoleDb(Database):
|
class ConsoleDb(Database):
|
||||||
"""
|
"""
|
||||||
Outputs results to the console
|
Outputs results to the console
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def started(self, item: Metadata) -> None:
|
def started(self, item: Metadata) -> None:
|
||||||
logger.info(f"STARTED {item}")
|
logger.info(f"STARTED {item}")
|
||||||
|
|
||||||
def failed(self, item: Metadata, reason:str) -> None:
|
def failed(self, item: Metadata, reason: str) -> None:
|
||||||
logger.error(f"FAILED {item}: {reason}")
|
logger.error(f"FAILED {item}: {reason}")
|
||||||
|
|
||||||
def aborted(self, item: Metadata) -> None:
|
def aborted(self, item: Metadata) -> None:
|
||||||
logger.warning(f"ABORTED {item}")
|
logger.warning(f"ABORTED {item}")
|
||||||
|
|
||||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||||
"""archival result ready - should be saved to DB"""
|
"""archival result ready - should be saved to DB"""
|
||||||
logger.success(f"DONE {item}")
|
logger.success(f"DONE {item}")
|
||||||
@@ -2,12 +2,11 @@
|
|||||||
"name": "CSV Database",
|
"name": "CSV Database",
|
||||||
"type": ["database"],
|
"type": ["database"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"dependencies": {"python": ["loguru"]
|
"dependencies": {"python": ["loguru"]},
|
||||||
},
|
"entry_point": "csv_db::CSVDb",
|
||||||
'entry_point': 'csv_db::CSVDb',
|
|
||||||
"configs": {
|
"configs": {
|
||||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Handles exporting archival results to a CSV file.
|
Handles exporting archival results to a CSV file.
|
||||||
|
|
||||||
|
|||||||
@@ -9,14 +9,15 @@ from auto_archiver.core import Metadata
|
|||||||
|
|
||||||
class CSVDb(Database):
|
class CSVDb(Database):
|
||||||
"""
|
"""
|
||||||
Outputs results to a CSV file
|
Outputs results to a CSV file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||||
"""archival result ready - should be saved to DB"""
|
"""archival result ready - should be saved to DB"""
|
||||||
logger.success(f"DONE {item}")
|
logger.success(f"DONE {item}")
|
||||||
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
|
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
|
||||||
with open(self.csv_file, "a", encoding="utf-8") as outf:
|
with open(self.csv_file, "a", encoding="utf-8") as outf:
|
||||||
writer = DictWriter(outf, fieldnames=asdict(Metadata()))
|
writer = DictWriter(outf, fieldnames=asdict(Metadata()))
|
||||||
if is_empty: writer.writeheader()
|
if is_empty:
|
||||||
|
writer.writeheader()
|
||||||
writer.writerow(asdict(item))
|
writer.writerow(asdict(item))
|
||||||
|
|||||||
@@ -1,27 +1,23 @@
|
|||||||
{
|
{
|
||||||
"name": "CSV Feeder",
|
"name": "CSV Feeder",
|
||||||
"type": ["feeder"],
|
"type": ["feeder"],
|
||||||
"requires_setup": False,
|
"dependencies": {"python": ["loguru"], "bin": [""]},
|
||||||
"dependencies": {
|
"requires_setup": True,
|
||||||
"python": ["loguru"],
|
"entry_point": "csv_feeder::CSVFeeder",
|
||||||
"bin": [""]
|
|
||||||
},
|
|
||||||
'requires_setup': True,
|
|
||||||
'entry_point': "csv_feeder::CSVFeeder",
|
|
||||||
"configs": {
|
"configs": {
|
||||||
"files": {
|
"files": {
|
||||||
"default": None,
|
"default": None,
|
||||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||||
Input files should be formatted with one URL per line",
|
Input files should be formatted with one URL per line",
|
||||||
"required": True,
|
"required": True,
|
||||||
"type": "valid_file",
|
"type": "valid_file",
|
||||||
"nargs": "+",
|
"nargs": "+",
|
||||||
},
|
|
||||||
"column": {
|
|
||||||
"default": None,
|
|
||||||
"help": "Column number or name to read the URLs from, 0-indexed",
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
|
"column": {
|
||||||
|
"default": None,
|
||||||
|
"help": "Column number or name to read the URLs from, 0-indexed",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Reads URLs from CSV files and feeds them into the archiving process.
|
Reads URLs from CSV files and feeds them into the archiving process.
|
||||||
|
|
||||||
@@ -33,5 +29,5 @@
|
|||||||
### Setup
|
### Setup
|
||||||
- Input files should be formatted with one URL per line, with or without a header row.
|
- Input files should be formatted with one URL per line, with or without a header row.
|
||||||
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
|
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,11 +5,10 @@ from auto_archiver.core import Feeder
|
|||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
from auto_archiver.utils import url_or_none
|
from auto_archiver.utils import url_or_none
|
||||||
|
|
||||||
|
|
||||||
class CSVFeeder(Feeder):
|
class CSVFeeder(Feeder):
|
||||||
|
|
||||||
column = None
|
column = None
|
||||||
|
|
||||||
|
|
||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
for file in self.files:
|
for file in self.files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r") as f:
|
||||||
@@ -20,9 +19,11 @@ class CSVFeeder(Feeder):
|
|||||||
try:
|
try:
|
||||||
url_column = first_row.index(url_column)
|
url_column = first_row.index(url_column)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
|
logger.error(
|
||||||
|
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
elif not(url_or_none(first_row[url_column])):
|
elif not (url_or_none(first_row[url_column])):
|
||||||
# it's a header row, but we've been given a column number already
|
# it's a header row, but we've been given a column number already
|
||||||
logger.debug(f"Skipping header row: {first_row}")
|
logger.debug(f"Skipping header row: {first_row}")
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -22,11 +22,18 @@
|
|||||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||||
"choices": ["random", "static"],
|
"choices": ["random", "static"],
|
||||||
},
|
},
|
||||||
"root_folder_id": {"required": True,
|
"root_folder_id": {
|
||||||
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
"required": True,
|
||||||
"oauth_token": {"default": None,
|
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'",
|
||||||
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
},
|
||||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
|
"oauth_token": {
|
||||||
|
"default": None,
|
||||||
|
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account.",
|
||||||
|
},
|
||||||
|
"service_account": {
|
||||||
|
"default": "secrets/service_account.json",
|
||||||
|
"help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
|
|
||||||
@@ -94,5 +101,5 @@ This module integrates Google Drive as a storage backend, enabling automatic fol
|
|||||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||||
|
|
||||||
|
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -15,12 +14,9 @@ from auto_archiver.core import Media
|
|||||||
from auto_archiver.core import Storage
|
from auto_archiver.core import Storage
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class GDriveStorage(Storage):
|
class GDriveStorage(Storage):
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.scopes = ['https://www.googleapis.com/auth/drive']
|
self.scopes = ["https://www.googleapis.com/auth/drive"]
|
||||||
# Initialize Google Drive service
|
# Initialize Google Drive service
|
||||||
self._setup_google_drive_service()
|
self._setup_google_drive_service()
|
||||||
|
|
||||||
@@ -37,25 +33,25 @@ class GDriveStorage(Storage):
|
|||||||
|
|
||||||
def _initialize_with_oauth_token(self):
|
def _initialize_with_oauth_token(self):
|
||||||
"""Initialize Google Drive service with OAuth token."""
|
"""Initialize Google Drive service with OAuth token."""
|
||||||
with open(self.oauth_token, 'r') as stream:
|
with open(self.oauth_token, "r") as stream:
|
||||||
creds_json = json.load(stream)
|
creds_json = json.load(stream)
|
||||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
creds_json["refresh_token"] = creds_json.get("refresh_token", "")
|
||||||
|
|
||||||
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
|
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
|
||||||
if not creds.valid and creds.expired and creds.refresh_token:
|
if not creds.valid and creds.expired and creds.refresh_token:
|
||||||
creds.refresh(Request())
|
creds.refresh(Request())
|
||||||
with open(self.oauth_token, 'w') as token_file:
|
with open(self.oauth_token, "w") as token_file:
|
||||||
logger.debug("Saving refreshed OAuth token.")
|
logger.debug("Saving refreshed OAuth token.")
|
||||||
token_file.write(creds.to_json())
|
token_file.write(creds.to_json())
|
||||||
elif not creds.valid:
|
elif not creds.valid:
|
||||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||||
|
|
||||||
return build('drive', 'v3', credentials=creds)
|
return build("drive", "v3", credentials=creds)
|
||||||
|
|
||||||
def _initialize_with_service_account(self):
|
def _initialize_with_service_account(self):
|
||||||
"""Initialize Google Drive service with service account."""
|
"""Initialize Google Drive service with service account."""
|
||||||
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
|
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
|
||||||
return build('drive', 'v3', credentials=creds)
|
return build("drive", "v3", credentials=creds)
|
||||||
|
|
||||||
def get_cdn_url(self, media: Media) -> str:
|
def get_cdn_url(self, media: Media) -> str:
|
||||||
"""
|
"""
|
||||||
@@ -79,7 +75,7 @@ class GDriveStorage(Storage):
|
|||||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||||
|
|
||||||
def upload(self, media: Media, **kwargs) -> bool:
|
def upload(self, media: Media, **kwargs) -> bool:
|
||||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||||
"""
|
"""
|
||||||
1. for each sub-folder in the path check if exists or create
|
1. for each sub-folder in the path check if exists or create
|
||||||
2. upload file to root_id/other_paths.../filename
|
2. upload file to root_id/other_paths.../filename
|
||||||
@@ -95,25 +91,30 @@ class GDriveStorage(Storage):
|
|||||||
parent_id = upload_to
|
parent_id = upload_to
|
||||||
|
|
||||||
# upload file to gd
|
# upload file to gd
|
||||||
logger.debug(f'uploading {filename=} to folder id {upload_to}')
|
logger.debug(f"uploading {filename=} to folder id {upload_to}")
|
||||||
file_metadata = {
|
file_metadata = {"name": [filename], "parents": [upload_to]}
|
||||||
'name': [filename],
|
|
||||||
'parents': [upload_to]
|
|
||||||
}
|
|
||||||
media = MediaFileUpload(media.filename, resumable=True)
|
media = MediaFileUpload(media.filename, resumable=True)
|
||||||
gd_file = self.service.files().create(supportsAllDrives=True, body=file_metadata, media_body=media, fields='id').execute()
|
gd_file = (
|
||||||
logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
|
self.service.files()
|
||||||
|
.create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id")
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
logger.debug(f"uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}")
|
||||||
|
|
||||||
# must be implemented even if unused
|
# must be implemented even if unused
|
||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
def _get_id_from_parent_and_name(self, parent_id: str,
|
def _get_id_from_parent_and_name(
|
||||||
name: str,
|
self,
|
||||||
retries: int = 1,
|
parent_id: str,
|
||||||
sleep_seconds: int = 10,
|
name: str,
|
||||||
use_mime_type: bool = False,
|
retries: int = 1,
|
||||||
raise_on_missing: bool = True,
|
sleep_seconds: int = 10,
|
||||||
use_cache=False):
|
use_mime_type: bool = False,
|
||||||
|
raise_on_missing: bool = True,
|
||||||
|
use_cache=False,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
||||||
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
||||||
@@ -134,32 +135,39 @@ class GDriveStorage(Storage):
|
|||||||
debug_header: str = f"[searching {name=} in {parent_id=}]"
|
debug_header: str = f"[searching {name=} in {parent_id=}]"
|
||||||
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
|
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
|
||||||
if use_mime_type:
|
if use_mime_type:
|
||||||
query_string += f" and mimeType='application/vnd.google-apps.folder' "
|
query_string += " and mimeType='application/vnd.google-apps.folder' "
|
||||||
|
|
||||||
for attempt in range(retries):
|
for attempt in range(retries):
|
||||||
results = self.service.files().list(
|
results = (
|
||||||
# both below for Google Shared Drives
|
self.service.files()
|
||||||
supportsAllDrives=True,
|
.list(
|
||||||
includeItemsFromAllDrives=True,
|
# both below for Google Shared Drives
|
||||||
q=query_string,
|
supportsAllDrives=True,
|
||||||
spaces='drive', # ie not appDataFolder or photos
|
includeItemsFromAllDrives=True,
|
||||||
fields='files(id, name)'
|
q=query_string,
|
||||||
).execute()
|
spaces="drive", # ie not appDataFolder or photos
|
||||||
items = results.get('files', [])
|
fields="files(id, name)",
|
||||||
|
)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
items = results.get("files", [])
|
||||||
|
|
||||||
if len(items) > 0:
|
if len(items) > 0:
|
||||||
logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
|
logger.debug(
|
||||||
_id = items[-1]['id']
|
f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}"
|
||||||
if use_cache: self.api_cache[cache_key] = _id
|
)
|
||||||
|
_id = items[-1]["id"]
|
||||||
|
if use_cache:
|
||||||
|
self.api_cache[cache_key] = _id
|
||||||
return _id
|
return _id
|
||||||
else:
|
else:
|
||||||
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
|
logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.")
|
||||||
if attempt < retries - 1:
|
if attempt < retries - 1:
|
||||||
logger.debug(f'sleeping for {sleep_seconds} second(s)')
|
logger.debug(f"sleeping for {sleep_seconds} second(s)")
|
||||||
time.sleep(sleep_seconds)
|
time.sleep(sleep_seconds)
|
||||||
|
|
||||||
if raise_on_missing:
|
if raise_on_missing:
|
||||||
raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
|
raise ValueError(f"{debug_header} not found after {retries} attempt(s)")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _mkdir(self, name: str, parent_id: str):
|
def _mkdir(self, name: str, parent_id: str):
|
||||||
@@ -167,12 +175,7 @@ class GDriveStorage(Storage):
|
|||||||
Creates a new GDrive folder @name inside folder @parent_id
|
Creates a new GDrive folder @name inside folder @parent_id
|
||||||
Returns id of the created folder
|
Returns id of the created folder
|
||||||
"""
|
"""
|
||||||
logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
|
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
|
||||||
file_metadata = {
|
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
||||||
'name': [name],
|
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
||||||
'mimeType': 'application/vnd.google-apps.folder',
|
return gd_folder.get("id")
|
||||||
'parents': [parent_id]
|
|
||||||
}
|
|
||||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
|
|
||||||
return gd_folder.get('id')
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,15 +4,16 @@ from auto_archiver.core.extractor import Extractor
|
|||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from .dropin import GenericDropin, InfoExtractor
|
from .dropin import GenericDropin, InfoExtractor
|
||||||
|
|
||||||
class Bluesky(GenericDropin):
|
|
||||||
|
|
||||||
|
class Bluesky(GenericDropin):
|
||||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
result.set_title(post["record"]["text"])
|
result.set_title(post["record"]["text"])
|
||||||
result.set_timestamp(post["record"]["createdAt"])
|
result.set_timestamp(post["record"]["createdAt"])
|
||||||
for k, v in self._get_post_data(post).items():
|
for k, v in self._get_post_data(post).items():
|
||||||
if v: result.set(k, v)
|
if v:
|
||||||
|
result.set(k, v)
|
||||||
|
|
||||||
# download if embeds present (1 video XOR >=1 images)
|
# download if embeds present (1 video XOR >=1 images)
|
||||||
for media in self._download_bsky_embeds(post, archiver):
|
for media in self._download_bsky_embeds(post, archiver):
|
||||||
@@ -23,7 +24,7 @@ class Bluesky(GenericDropin):
|
|||||||
|
|
||||||
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
|
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
|
||||||
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
|
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
|
||||||
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
|
handle, video_id = ie_instance._match_valid_url(url).group("handle", "id")
|
||||||
return ie_instance._extract_post(handle=handle, post_id=video_id)
|
return ie_instance._extract_post(handle=handle, post_id=video_id)
|
||||||
|
|
||||||
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
||||||
@@ -37,16 +38,15 @@ class Bluesky(GenericDropin):
|
|||||||
|
|
||||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||||
for image_media in image_medias:
|
for image_media in image_medias:
|
||||||
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
|
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||||
image_media = archiver.download_from_url(url)
|
image_media = archiver.download_from_url(url)
|
||||||
media.append(Media(image_media))
|
media.append(Media(image_media))
|
||||||
for video_media in video_medias:
|
for video_media in video_medias:
|
||||||
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
|
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||||
video_media = archiver.download_from_url(url)
|
video_media = archiver.download_from_url(url)
|
||||||
media.append(Media(video_media))
|
media.append(Media(video_media))
|
||||||
return media
|
return media
|
||||||
|
|
||||||
|
|
||||||
def _get_post_data(self, post: dict) -> dict:
|
def _get_post_data(self, post: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
|
Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from yt_dlp.extractor.common import InfoExtractor
|
|||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
|
|
||||||
|
|
||||||
class GenericDropin:
|
class GenericDropin:
|
||||||
"""Base class for dropins for the generic extractor.
|
"""Base class for dropins for the generic extractor.
|
||||||
|
|
||||||
@@ -29,14 +30,12 @@ class GenericDropin:
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError("This method should be implemented in the subclass")
|
raise NotImplementedError("This method should be implemented in the subclass")
|
||||||
|
|
||||||
|
|
||||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||||
"""
|
"""
|
||||||
This method should create a Metadata object from the post data.
|
This method should create a Metadata object from the post data.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError("This method should be implemented in the subclass")
|
raise NotImplementedError("This method should be implemented in the subclass")
|
||||||
|
|
||||||
|
|
||||||
def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
|
def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
|
||||||
"""
|
"""
|
||||||
This method should return True if you want to skip the ytdlp download method.
|
This method should return True if you want to skip the ytdlp download method.
|
||||||
|
|||||||
@@ -3,10 +3,9 @@ from .dropin import GenericDropin
|
|||||||
|
|
||||||
class Facebook(GenericDropin):
|
class Facebook(GenericDropin):
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
video_id = ie_instance._match_valid_url(url).group('id')
|
video_id = ie_instance._match_valid_url(url).group("id")
|
||||||
ie_instance._download_webpage(
|
ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), video_id)
|
||||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group("id"))
|
||||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
|
||||||
|
|
||||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||||
post_data = ie_instance._extract_metadata(webpage)
|
post_data = ie_instance._extract_metadata(webpage)
|
||||||
@@ -14,5 +13,5 @@ class Facebook(GenericDropin):
|
|||||||
|
|
||||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||||
metadata = archiver.create_metadata(url)
|
metadata = archiver.create_metadata(url)
|
||||||
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
|
metadata.set_title(post.get("title")).set_content(post.get("description")).set_post_data(post)
|
||||||
return metadata
|
return metadata
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
import datetime, os
|
import datetime
|
||||||
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
@@ -13,8 +14,11 @@ from loguru import logger
|
|||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|
||||||
|
|
||||||
class SkipYtdlp(Exception):
|
class SkipYtdlp(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class GenericExtractor(Extractor):
|
class GenericExtractor(Extractor):
|
||||||
_dropins = {}
|
_dropins = {}
|
||||||
|
|
||||||
@@ -23,8 +27,8 @@ class GenericExtractor(Extractor):
|
|||||||
if self.ytdlp_update_interval < 0:
|
if self.ytdlp_update_interval < 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
use_secrets = os.path.exists('secrets')
|
use_secrets = os.path.exists("secrets")
|
||||||
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
|
path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
|
||||||
next_update_check = None
|
next_update_check = None
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
with open(path, "r") as f:
|
with open(path, "r") as f:
|
||||||
@@ -39,8 +43,11 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
def update_ytdlp(self):
|
def update_ytdlp(self):
|
||||||
logger.info("Checking and updating yt-dlp...")
|
logger.info("Checking and updating yt-dlp...")
|
||||||
logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
|
logger.info(
|
||||||
|
f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
|
||||||
|
)
|
||||||
from importlib.metadata import version as get_version
|
from importlib.metadata import version as get_version
|
||||||
|
|
||||||
old_version = get_version("yt-dlp")
|
old_version = get_version("yt-dlp")
|
||||||
try:
|
try:
|
||||||
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
||||||
@@ -70,7 +77,9 @@ class GenericExtractor(Extractor):
|
|||||||
"""
|
"""
|
||||||
return any(self.suitable_extractors(url))
|
return any(self.suitable_extractors(url))
|
||||||
|
|
||||||
def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
|
def download_additional_media(
|
||||||
|
self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata
|
||||||
|
) -> Metadata:
|
||||||
"""
|
"""
|
||||||
Downloads additional media like images, comments, subtitles, etc.
|
Downloads additional media like images, comments, subtitles, etc.
|
||||||
|
|
||||||
@@ -79,7 +88,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
# Just get the main thumbnail. More thumbnails are available in
|
# Just get the main thumbnail. More thumbnails are available in
|
||||||
# video_data['thumbnails'] should they be required
|
# video_data['thumbnails'] should they be required
|
||||||
thumbnail_url = video_data.get('thumbnail')
|
thumbnail_url = video_data.get("thumbnail")
|
||||||
if thumbnail_url:
|
if thumbnail_url:
|
||||||
try:
|
try:
|
||||||
cover_image_path = self.download_from_url(thumbnail_url)
|
cover_image_path = self.download_from_url(thumbnail_url)
|
||||||
@@ -102,14 +111,64 @@ class GenericExtractor(Extractor):
|
|||||||
Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
|
Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
|
||||||
"""
|
"""
|
||||||
|
|
||||||
base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
|
base_keys = [
|
||||||
'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
|
"formats",
|
||||||
'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
|
"thumbnail",
|
||||||
'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
|
"display_id",
|
||||||
'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
|
"epoch",
|
||||||
'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
|
"requested_downloads",
|
||||||
'_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
|
"duration_string",
|
||||||
'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
|
"thumbnails",
|
||||||
|
"http_headers",
|
||||||
|
"webpage_url_basename",
|
||||||
|
"webpage_url_domain",
|
||||||
|
"extractor",
|
||||||
|
"extractor_key",
|
||||||
|
"playlist",
|
||||||
|
"playlist_index",
|
||||||
|
"duration_string",
|
||||||
|
"protocol",
|
||||||
|
"requested_subtitles",
|
||||||
|
"format_id",
|
||||||
|
"acodec",
|
||||||
|
"vcodec",
|
||||||
|
"ext",
|
||||||
|
"epoch",
|
||||||
|
"_has_drm",
|
||||||
|
"filesize",
|
||||||
|
"audio_ext",
|
||||||
|
"video_ext",
|
||||||
|
"vbr",
|
||||||
|
"abr",
|
||||||
|
"resolution",
|
||||||
|
"dynamic_range",
|
||||||
|
"aspect_ratio",
|
||||||
|
"cookies",
|
||||||
|
"format",
|
||||||
|
"quality",
|
||||||
|
"preference",
|
||||||
|
"artists",
|
||||||
|
"channel_id",
|
||||||
|
"subtitles",
|
||||||
|
"tbr",
|
||||||
|
"url",
|
||||||
|
"original_url",
|
||||||
|
"automatic_captions",
|
||||||
|
"playable_in_embed",
|
||||||
|
"live_status",
|
||||||
|
"_format_sort_fields",
|
||||||
|
"chapters",
|
||||||
|
"requested_formats",
|
||||||
|
"format_note",
|
||||||
|
"audio_channels",
|
||||||
|
"asr",
|
||||||
|
"fps",
|
||||||
|
"was_live",
|
||||||
|
"is_live",
|
||||||
|
"heatmap",
|
||||||
|
"age_limit",
|
||||||
|
"stretched_ratio",
|
||||||
|
]
|
||||||
|
|
||||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||||
if dropin:
|
if dropin:
|
||||||
@@ -120,7 +179,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
return base_keys
|
return base_keys
|
||||||
|
|
||||||
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
|
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str, result: Metadata) -> Metadata:
|
||||||
"""
|
"""
|
||||||
Creates a Metadata object from the given video_data
|
Creates a Metadata object from the given video_data
|
||||||
"""
|
"""
|
||||||
@@ -129,23 +188,30 @@ class GenericExtractor(Extractor):
|
|||||||
result = self.download_additional_media(video_data, info_extractor, result)
|
result = self.download_additional_media(video_data, info_extractor, result)
|
||||||
|
|
||||||
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
|
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
|
||||||
result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
|
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
if "description" in video_data: result.set_content(video_data["description"])
|
if "description" in video_data:
|
||||||
|
result.set_content(video_data["description"])
|
||||||
# extract comments if enabled
|
# extract comments if enabled
|
||||||
if self.comments:
|
if self.comments:
|
||||||
result.set("comments", [{
|
result.set(
|
||||||
"text": c["text"],
|
"comments",
|
||||||
"author": c["author"],
|
[
|
||||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
|
{
|
||||||
} for c in video_data.get("comments", [])])
|
"text": c["text"],
|
||||||
|
"author": c["author"],
|
||||||
|
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz=datetime.timezone.utc),
|
||||||
|
}
|
||||||
|
for c in video_data.get("comments", [])
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
# then add the common metadata
|
# then add the common metadata
|
||||||
if timestamp := video_data.pop("timestamp", None):
|
if timestamp := video_data.pop("timestamp", None):
|
||||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
|
timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
|
||||||
result.set_timestamp(timestamp)
|
result.set_timestamp(timestamp)
|
||||||
if upload_date := video_data.pop("upload_date", None):
|
if upload_date := video_data.pop("upload_date", None):
|
||||||
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
|
upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
|
||||||
result.set("upload_date", upload_date)
|
result.set("upload_date", upload_date)
|
||||||
|
|
||||||
# then clean away any keys we don't want
|
# then clean away any keys we don't want
|
||||||
@@ -176,18 +242,20 @@ class GenericExtractor(Extractor):
|
|||||||
post_data = dropin.extract_post(url, ie_instance)
|
post_data = dropin.extract_post(url, ie_instance)
|
||||||
return dropin.create_metadata(post_data, ie_instance, self, url)
|
return dropin.create_metadata(post_data, ie_instance, self, url)
|
||||||
|
|
||||||
def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
def get_metadata_for_video(
|
||||||
|
self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
|
||||||
|
) -> Metadata:
|
||||||
# this time download
|
# this time download
|
||||||
ydl.params['getcomments'] = self.comments
|
ydl.params["getcomments"] = self.comments
|
||||||
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||||
if "entries" in data:
|
if "entries" in data:
|
||||||
entries = data.get("entries", [])
|
entries = data.get("entries", [])
|
||||||
if not len(entries):
|
if not len(entries):
|
||||||
logger.warning('YoutubeDLArchiver could not find any video')
|
logger.warning("YoutubeDLArchiver could not find any video")
|
||||||
return False
|
return False
|
||||||
else: entries = [data]
|
else:
|
||||||
|
entries = [data]
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
|
|
||||||
@@ -195,17 +263,18 @@ class GenericExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
filename = ydl.prepare_filename(entry)
|
filename = ydl.prepare_filename(entry)
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
filename = filename.split('.')[0] + '.mkv'
|
filename = filename.split(".")[0] + ".mkv"
|
||||||
|
|
||||||
new_media = Media(filename)
|
new_media = Media(filename)
|
||||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||||
if x in entry: new_media.set(x, entry[x])
|
if x in entry:
|
||||||
|
new_media.set(x, entry[x])
|
||||||
|
|
||||||
# read text from subtitles if enabled
|
# read text from subtitles if enabled
|
||||||
if self.subtitles:
|
if self.subtitles:
|
||||||
for lang, val in (data.get('requested_subtitles') or {}).items():
|
for lang, val in (data.get("requested_subtitles") or {}).items():
|
||||||
try:
|
try:
|
||||||
subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
|
subs = pysubs2.load(val.get("filepath"), encoding="utf-8")
|
||||||
text = " ".join([line.text for line in subs])
|
text = " ".join([line.text for line in subs])
|
||||||
new_media.set(f"subtitles_{lang}", text)
|
new_media.set(f"subtitles_{lang}", text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -216,7 +285,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
|
|
||||||
def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
|
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
|
||||||
dropin_name = dropin_name.lower()
|
dropin_name = dropin_name.lower()
|
||||||
|
|
||||||
if dropin_name == "generic":
|
if dropin_name == "generic":
|
||||||
@@ -224,6 +293,7 @@ class GenericExtractor(Extractor):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
dropin_class_name = dropin_name.title()
|
dropin_class_name = dropin_name.title()
|
||||||
|
|
||||||
def _load_dropin(dropin):
|
def _load_dropin(dropin):
|
||||||
dropin_class = getattr(dropin, dropin_class_name)()
|
dropin_class = getattr(dropin, dropin_class_name)()
|
||||||
return self._dropins.setdefault(dropin_name, dropin_class)
|
return self._dropins.setdefault(dropin_name, dropin_class)
|
||||||
@@ -264,7 +334,7 @@ class GenericExtractor(Extractor):
|
|||||||
use the extractor's _extract_post method to get the post metadata if possible.
|
use the extractor's _extract_post method to get the post metadata if possible.
|
||||||
"""
|
"""
|
||||||
# when getting info without download, we also don't need the comments
|
# when getting info without download, we also don't need the comments
|
||||||
ydl.params['getcomments'] = False
|
ydl.params["getcomments"] = False
|
||||||
result = False
|
result = False
|
||||||
|
|
||||||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||||
@@ -276,7 +346,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||||
if data.get('is_live', False) and not self.livestreams:
|
if data.get("is_live", False) and not self.livestreams:
|
||||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||||
return False
|
return False
|
||||||
# it's a valid video, that the youtubdedl can download out of the box
|
# it's a valid video, that the youtubdedl can download out of the box
|
||||||
@@ -288,7 +358,9 @@ class GenericExtractor(Extractor):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
if not isinstance(e, SkipYtdlp):
|
if not isinstance(e, SkipYtdlp):
|
||||||
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
|
logger.debug(
|
||||||
|
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||||
@@ -296,9 +368,12 @@ class GenericExtractor(Extractor):
|
|||||||
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
||||||
return False
|
return False
|
||||||
except Exception as generic_e:
|
except Exception as generic_e:
|
||||||
logger.debug('Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
logger.debug(
|
||||||
name=info_extractor.IE_NAME, error=str(generic_e),
|
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||||
exc_info=True)
|
name=info_extractor.IE_NAME,
|
||||||
|
error=str(generic_e),
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
@@ -316,53 +391,56 @@ class GenericExtractor(Extractor):
|
|||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
# TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||||
if url.startswith("https://ya.ru"):
|
if url.startswith("https://ya.ru"):
|
||||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||||
item.set("replaced_url", url)
|
item.set("replaced_url", url)
|
||||||
|
|
||||||
|
ydl_options = [
|
||||||
ydl_options = ["-o", os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
"-o",
|
||||||
"--quiet",
|
os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"),
|
||||||
"--no-playlist" if not self.allow_playlist else "--yes-playlist",
|
"--quiet",
|
||||||
"--write-subs" if self.subtitles else "--no-write-subs",
|
"--no-playlist" if not self.allow_playlist else "--yes-playlist",
|
||||||
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
|
"--write-subs" if self.subtitles else "--no-write-subs",
|
||||||
"--live-from-start" if self.live_from_start else "--no-live-from-start",
|
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
|
||||||
"--proxy", self.proxy if self.proxy else '',
|
"--live-from-start" if self.live_from_start else "--no-live-from-start",
|
||||||
f"--max-downloads {self.max_downloads}" if self.max_downloads != "inf" else '',
|
"--proxy",
|
||||||
f"--playlist-end {self.max_downloads}" if self.max_downloads != "inf" else ''
|
self.proxy if self.proxy else "",
|
||||||
]
|
f"--max-downloads {self.max_downloads}" if self.max_downloads != "inf" else "",
|
||||||
|
f"--playlist-end {self.max_downloads}" if self.max_downloads != "inf" else "",
|
||||||
|
]
|
||||||
|
|
||||||
# set up auth
|
# set up auth
|
||||||
auth = self.auth_for_site(url, extract_cookies=False)
|
auth = self.auth_for_site(url, extract_cookies=False)
|
||||||
|
|
||||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||||
if auth:
|
if auth:
|
||||||
if 'username' in auth and 'password' in auth:
|
if "username" in auth and "password" in auth:
|
||||||
logger.debug(f'Using provided auth username and password for {url}')
|
logger.debug(f"Using provided auth username and password for {url}")
|
||||||
ydl_options.extend(('--username', auth['username']))
|
ydl_options.extend(("--username", auth["username"]))
|
||||||
ydl_options.extend(('--password', auth['password']))
|
ydl_options.extend(("--password", auth["password"]))
|
||||||
elif 'cookie' in auth:
|
elif "cookie" in auth:
|
||||||
logger.debug(f'Using provided auth cookie for {url}')
|
logger.debug(f"Using provided auth cookie for {url}")
|
||||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||||
elif 'cookies_from_browser' in auth:
|
elif "cookies_from_browser" in auth:
|
||||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
|
||||||
ydl_options.extend(('--cookies-from-browser', auth['cookies_from_browser']))
|
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
||||||
elif 'cookies_file' in auth:
|
elif "cookies_file" in auth:
|
||||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
|
||||||
ydl_options.extend(('--cookies', auth['cookies_file']))
|
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
||||||
|
|
||||||
if self.ytdlp_args:
|
if self.ytdlp_args:
|
||||||
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
|
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
|
||||||
ydl_options += self.ytdlp_args.split(" ")
|
ydl_options += self.ytdlp_args.split(" ")
|
||||||
|
|
||||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||||
ydl = yt_dlp.YoutubeDL(validated_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
ydl = yt_dlp.YoutubeDL(
|
||||||
|
validated_options
|
||||||
|
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||||
|
|
||||||
for info_extractor in self.suitable_extractors(url):
|
for info_extractor in self.suitable_extractors(url):
|
||||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||||
if result:
|
if result:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from auto_archiver.core import Metadata, Media
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from .dropin import GenericDropin
|
from .dropin import GenericDropin
|
||||||
|
|
||||||
|
|
||||||
class Tiktok(GenericDropin):
|
class Tiktok(GenericDropin):
|
||||||
"""
|
"""
|
||||||
TikTok droping for the Generic Extractor that uses an unofficial API if/when ytdlp fails.
|
TikTok droping for the Generic Extractor that uses an unofficial API if/when ytdlp fails.
|
||||||
@@ -13,7 +14,6 @@ class Tiktok(GenericDropin):
|
|||||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
|
|
||||||
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
||||||
|
|
||||||
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
||||||
@@ -27,7 +27,7 @@ class Tiktok(GenericDropin):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}")
|
raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}")
|
||||||
|
|
||||||
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
|
if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
|
||||||
raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
|
raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
|
||||||
|
|
||||||
# tries to get the non-watermarked version first
|
# tries to get the non-watermarked version first
|
||||||
@@ -35,13 +35,11 @@ class Tiktok(GenericDropin):
|
|||||||
if not video_url:
|
if not video_url:
|
||||||
raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
|
raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
|
||||||
|
|
||||||
api_data['video_url'] = video_url
|
api_data["video_url"] = video_url
|
||||||
return api_data
|
return api_data
|
||||||
|
|
||||||
|
|
||||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||||
|
# prepare result, start by downloading video
|
||||||
# prepare result, start by downloading video
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
video_url = post.pop("video_url")
|
video_url = post.pop("video_url")
|
||||||
|
|
||||||
@@ -66,7 +64,7 @@ class Tiktok(GenericDropin):
|
|||||||
if created_at := post.pop("create_time", None):
|
if created_at := post.pop("create_time", None):
|
||||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||||
|
|
||||||
if (author := post.pop("author", None)):
|
if author := post.pop("author", None):
|
||||||
result.set("author", author)
|
result.set("author", author)
|
||||||
|
|
||||||
result.set("api_data", post)
|
result.set("api_data", post)
|
||||||
|
|||||||
@@ -9,11 +9,11 @@ from dateutil.parser import parse as parse_dt
|
|||||||
|
|
||||||
from .dropin import GenericDropin
|
from .dropin import GenericDropin
|
||||||
|
|
||||||
class Truth(GenericDropin):
|
|
||||||
|
|
||||||
|
class Truth(GenericDropin):
|
||||||
def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
|
def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
|
||||||
video_id = ie_instance._match_id(url)
|
video_id = ie_instance._match_id(url)
|
||||||
truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
|
truthsocial_url = f"https://truthsocial.com/api/v1/statuses/{video_id}"
|
||||||
return ie_instance._download_json(truthsocial_url, video_id)
|
return ie_instance._download_json(truthsocial_url, video_id)
|
||||||
|
|
||||||
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
|
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
|
||||||
@@ -32,12 +32,23 @@ class Truth(GenericDropin):
|
|||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
|
timestamp = post["created_at"] # format is 2022-12-29T19:51:18.161Z
|
||||||
result.set_timestamp(parse_dt(timestamp))
|
result.set_timestamp(parse_dt(timestamp))
|
||||||
result.set('description', post['content'])
|
result.set("description", post["content"])
|
||||||
result.set('author', post['account']['username'])
|
result.set("author", post["account"]["username"])
|
||||||
|
|
||||||
for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
|
for key in [
|
||||||
|
"replies_count",
|
||||||
|
"reblogs_count",
|
||||||
|
"favourites_count",
|
||||||
|
("account", "followers_count"),
|
||||||
|
("account", "following_count"),
|
||||||
|
("account", "statuses_count"),
|
||||||
|
("account", "display_name"),
|
||||||
|
"language",
|
||||||
|
"in_reply_to_account",
|
||||||
|
"replies_count",
|
||||||
|
]:
|
||||||
if isinstance(key, tuple):
|
if isinstance(key, tuple):
|
||||||
store_key = " ".join(key)
|
store_key = " ".join(key)
|
||||||
else:
|
else:
|
||||||
@@ -45,8 +56,8 @@ class Truth(GenericDropin):
|
|||||||
result.set(store_key, traverse_obj(post, key))
|
result.set(store_key, traverse_obj(post, key))
|
||||||
|
|
||||||
# add the media
|
# add the media
|
||||||
for media in post.get('media_attachments', []):
|
for media in post.get("media_attachments", []):
|
||||||
filename = archiver.download_from_url(media['url'])
|
filename = archiver.download_from_url(media["url"])
|
||||||
result.add_media(Media(filename), id=media.get('id'))
|
result.add_media(Media(filename), id=media.get("id"))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
@@ -1,4 +1,6 @@
|
|||||||
import re, mimetypes, json
|
import re
|
||||||
|
import mimetypes
|
||||||
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -10,9 +12,8 @@ from auto_archiver.core.extractor import Extractor
|
|||||||
|
|
||||||
from .dropin import GenericDropin, InfoExtractor
|
from .dropin import GenericDropin, InfoExtractor
|
||||||
|
|
||||||
|
|
||||||
class Twitter(GenericDropin):
|
class Twitter(GenericDropin):
|
||||||
|
|
||||||
|
|
||||||
def choose_variant(self, variants):
|
def choose_variant(self, variants):
|
||||||
# choosing the highest quality possible
|
# choosing the highest quality possible
|
||||||
variant, width, height = None, 0, 0
|
variant, width, height = None, 0, 0
|
||||||
@@ -29,42 +30,41 @@ class Twitter(GenericDropin):
|
|||||||
return variant
|
return variant
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||||
twid = ie_instance._match_valid_url(url).group('id')
|
twid = ie_instance._match_valid_url(url).group("id")
|
||||||
return ie_instance._extract_status(twid=twid)
|
return ie_instance._extract_status(twid=twid)
|
||||||
|
|
||||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
try:
|
try:
|
||||||
if not tweet.get("user") or not tweet.get("created_at"):
|
if not tweet.get("user") or not tweet.get("created_at"):
|
||||||
raise ValueError(f"Error retreiving post. Are you sure it exists?")
|
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||||
timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
result\
|
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
|
||||||
.set_title(tweet.get('full_text', ''))\
|
timestamp
|
||||||
.set_content(json.dumps(tweet, ensure_ascii=False))\
|
)
|
||||||
.set_timestamp(timestamp)
|
|
||||||
if not tweet.get("entities", {}).get("media"):
|
if not tweet.get("entities", {}).get("media"):
|
||||||
logger.debug('No media found, archiving tweet text only')
|
logger.debug("No media found, archiving tweet text only")
|
||||||
result.status = "twitter-ytdl"
|
result.status = "twitter-ytdl"
|
||||||
return result
|
return result
|
||||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
mimetype = ""
|
mimetype = ""
|
||||||
if tw_media["type"] == "photo":
|
if tw_media["type"] == "photo":
|
||||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
|
media.set("src", UrlUtil.twitter_best_quality_url(tw_media["media_url_https"]))
|
||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
elif tw_media["type"] == "video":
|
elif tw_media["type"] == "video":
|
||||||
variant = self.choose_variant(tw_media['video_info']['variants'])
|
variant = self.choose_variant(tw_media["video_info"]["variants"])
|
||||||
media.set("src", variant['url'])
|
media.set("src", variant["url"])
|
||||||
mimetype = variant['content_type']
|
mimetype = variant["content_type"]
|
||||||
elif tw_media["type"] == "animated_gif":
|
elif tw_media["type"] == "animated_gif":
|
||||||
variant = tw_media['video_info']['variants'][0]
|
variant = tw_media["video_info"]["variants"][0]
|
||||||
media.set("src", variant['url'])
|
media.set("src", variant["url"])
|
||||||
mimetype = variant['content_type']
|
mimetype = variant["content_type"]
|
||||||
ext = mimetypes.guess_extension(mimetype)
|
ext = mimetypes.guess_extension(mimetype)
|
||||||
media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
return result
|
return result
|
||||||
@@ -12,9 +12,7 @@
|
|||||||
"default": None,
|
"default": None,
|
||||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||||
},
|
},
|
||||||
"header": {"default": 1,
|
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||||
"type": "int",
|
|
||||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
|
||||||
"service_account": {
|
"service_account": {
|
||||||
"default": "secrets/service_account.json",
|
"default": "secrets/service_account.json",
|
||||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||||
@@ -53,19 +51,6 @@
|
|||||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||||
"type": "bool",
|
"type": "bool",
|
||||||
},
|
},
|
||||||
"allow_worksheets": {
|
|
||||||
"default": set(),
|
|
||||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
|
||||||
},
|
|
||||||
"block_worksheets": {
|
|
||||||
"default": set(),
|
|
||||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
|
||||||
},
|
|
||||||
"use_sheet_names_in_stored_paths": {
|
|
||||||
"default": True,
|
|
||||||
"type": "bool",
|
|
||||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
GsheetsFeederDatabase
|
GsheetsFeederDatabase
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ The filtered rows are processed into `Metadata` objects.
|
|||||||
- validates the sheet's structure and filters rows based on input configurations.
|
- validates the sheet's structure and filters rows based on input configurations.
|
||||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Tuple, Union
|
from typing import Tuple, Union
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
@@ -19,11 +20,10 @@ from slugify import slugify
|
|||||||
from auto_archiver.core import Feeder, Database, Media
|
from auto_archiver.core import Feeder, Database, Media
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||||
from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
from auto_archiver.utils.misc import get_current_timestamp
|
||||||
|
|
||||||
|
|
||||||
class GsheetsFeederDB(Feeder, Database):
|
class GsheetsFeederDB(Feeder, Database):
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||||
# TODO mv to validators
|
# TODO mv to validators
|
||||||
@@ -42,24 +42,28 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
if not self.should_process_sheet(worksheet.title):
|
if not self.should_process_sheet(worksheet.title):
|
||||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||||
continue
|
continue
|
||||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||||
if len(missing_cols := self.missing_required_columns(gw)):
|
if len(missing_cols := self.missing_required_columns(gw)):
|
||||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
logger.warning(
|
||||||
|
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# process and yield metadata here:
|
# process and yield metadata here:
|
||||||
yield from self._process_rows(gw)
|
yield from self._process_rows(gw)
|
||||||
logger.success(f'Finished worksheet {worksheet.title}')
|
logger.success(f"Finished worksheet {worksheet.title}")
|
||||||
|
|
||||||
def _process_rows(self, gw: GWorksheet):
|
def _process_rows(self, gw: GWorksheet):
|
||||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||||
url = gw.get_cell(row, 'url').strip()
|
url = gw.get_cell(row, "url").strip()
|
||||||
if not len(url): continue
|
if not len(url):
|
||||||
original_status = gw.get_cell(row, 'status')
|
continue
|
||||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
original_status = gw.get_cell(row, "status")
|
||||||
|
status = gw.get_cell(row, "status", fresh=original_status in ["", None])
|
||||||
# TODO: custom status parser(?) aka should_retry_from_status
|
# TODO: custom status parser(?) aka should_retry_from_status
|
||||||
if status not in ['', None]: continue
|
if status not in ["", None]:
|
||||||
|
continue
|
||||||
|
|
||||||
# All checks done - archival process starts here
|
# All checks done - archival process starts here
|
||||||
m = Metadata().set_url(url)
|
m = Metadata().set_url(url)
|
||||||
@@ -70,10 +74,10 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
# TODO: Check folder value not being recognised
|
# TODO: Check folder value not being recognised
|
||||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||||
|
|
||||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
if gw.get_cell_or_default(row, "folder", "") is None:
|
||||||
folder = ''
|
folder = ""
|
||||||
else:
|
else:
|
||||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
folder = slugify(gw.get_cell_or_default(row, "folder", "").strip())
|
||||||
if len(folder):
|
if len(folder):
|
||||||
if self.use_sheet_names_in_stored_paths:
|
if self.use_sheet_names_in_stored_paths:
|
||||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||||
@@ -91,12 +95,11 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
|
|
||||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||||
missing = []
|
missing = []
|
||||||
for required_col in ['url', 'status']:
|
for required_col in ["url", "status"]:
|
||||||
if not gw.col_exists(required_col):
|
if not gw.col_exists(required_col):
|
||||||
missing.append(required_col)
|
missing.append(required_col)
|
||||||
return missing
|
return missing
|
||||||
|
|
||||||
|
|
||||||
def started(self, item: Metadata) -> None:
|
def started(self, item: Metadata) -> None:
|
||||||
logger.warning(f"STARTED {item}")
|
logger.warning(f"STARTED {item}")
|
||||||
gw, row = self._retrieve_gsheet(item)
|
gw, row = self._retrieve_gsheet(item)
|
||||||
@@ -155,9 +158,7 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
if len(pdq_hashes):
|
if len(pdq_hashes):
|
||||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||||
|
|
||||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||||
screenshot, "urls"
|
|
||||||
):
|
|
||||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||||
|
|
||||||
if thumbnail := item.get_first_image("thumbnail"):
|
if thumbnail := item.get_first_image("thumbnail"):
|
||||||
@@ -186,11 +187,12 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
logger.debug(f"Unable to update sheet: {e}")
|
logger.debug(f"Unable to update sheet: {e}")
|
||||||
|
|
||||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||||
|
|
||||||
if gsheet := item.get_context("gsheet"):
|
if gsheet := item.get_context("gsheet"):
|
||||||
gw: GWorksheet = gsheet.get("worksheet")
|
gw: GWorksheet = gsheet.get("worksheet")
|
||||||
row: int = gsheet.get("row")
|
row: int = gsheet.get("row")
|
||||||
elif self.sheet_id:
|
elif self.sheet_id:
|
||||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
logger.error(
|
||||||
|
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
|
||||||
|
)
|
||||||
|
|
||||||
return gw, row
|
return gw, row
|
||||||
|
|||||||
@@ -8,21 +8,22 @@ class GWorksheet:
|
|||||||
should always include the offset of the header.
|
should always include the offset of the header.
|
||||||
eg: if header=4, row 5 will be the first with data.
|
eg: if header=4, row 5 will be the first with data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
COLUMN_NAMES = {
|
COLUMN_NAMES = {
|
||||||
'url': 'link',
|
"url": "link",
|
||||||
'status': 'archive status',
|
"status": "archive status",
|
||||||
'folder': 'destination folder',
|
"folder": "destination folder",
|
||||||
'archive': 'archive location',
|
"archive": "archive location",
|
||||||
'date': 'archive date',
|
"date": "archive date",
|
||||||
'thumbnail': 'thumbnail',
|
"thumbnail": "thumbnail",
|
||||||
'timestamp': 'upload timestamp',
|
"timestamp": "upload timestamp",
|
||||||
'title': 'upload title',
|
"title": "upload title",
|
||||||
'text': 'text content',
|
"text": "text content",
|
||||||
'screenshot': 'screenshot',
|
"screenshot": "screenshot",
|
||||||
'hash': 'hash',
|
"hash": "hash",
|
||||||
'pdq_hash': 'perceptual hashes',
|
"pdq_hash": "perceptual hashes",
|
||||||
'wacz': 'wacz',
|
"wacz": "wacz",
|
||||||
'replaywebpage': 'replaywebpage',
|
"replaywebpage": "replaywebpage",
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||||
@@ -36,7 +37,7 @@ class GWorksheet:
|
|||||||
|
|
||||||
def _check_col_exists(self, col: str):
|
def _check_col_exists(self, col: str):
|
||||||
if col not in self.columns:
|
if col not in self.columns:
|
||||||
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
raise Exception(f"Column {col} is not in the configured column names: {self.columns.keys()}")
|
||||||
|
|
||||||
def _col_index(self, col: str):
|
def _col_index(self, col: str):
|
||||||
self._check_col_exists(col)
|
self._check_col_exists(col)
|
||||||
@@ -67,11 +68,11 @@ class GWorksheet:
|
|||||||
|
|
||||||
if fresh:
|
if fresh:
|
||||||
return self.wks.cell(row, col_index + 1).value
|
return self.wks.cell(row, col_index + 1).value
|
||||||
if type(row) == int:
|
if isinstance(row, int):
|
||||||
row = self.get_row(row)
|
row = self.get_row(row)
|
||||||
|
|
||||||
if col_index >= len(row):
|
if col_index >= len(row):
|
||||||
return ''
|
return ""
|
||||||
return row[col_index]
|
return row[col_index]
|
||||||
|
|
||||||
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
||||||
@@ -83,7 +84,7 @@ class GWorksheet:
|
|||||||
if when_empty_use_default and val.strip() == "":
|
if when_empty_use_default and val.strip() == "":
|
||||||
return default
|
return default
|
||||||
return val
|
return val
|
||||||
except:
|
except Exception:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def set_cell(self, row: int, col: str, val):
|
def set_cell(self, row: int, col: str, val):
|
||||||
@@ -96,13 +97,9 @@ class GWorksheet:
|
|||||||
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
||||||
"""
|
"""
|
||||||
cell_updates = [
|
cell_updates = [
|
||||||
{
|
{"range": self.to_a1(row, col), "values": [[str(val)[0:49999]]]} for row, col, val in cell_updates
|
||||||
'range': self.to_a1(row, col),
|
|
||||||
'values': [[str(val)[0:49999]]]
|
|
||||||
}
|
|
||||||
for row, col, val in cell_updates
|
|
||||||
]
|
]
|
||||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
self.wks.batch_update(cell_updates, value_input_option="USER_ENTERED")
|
||||||
|
|
||||||
def to_a1(self, row: int, col: str):
|
def to_a1(self, row: int, col: str):
|
||||||
# row is 1-based
|
# row is 1-based
|
||||||
|
|||||||
@@ -3,16 +3,17 @@
|
|||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["loguru"],
|
"python": ["loguru"],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||||
# TODO add non-negative requirement to match previous implementation?
|
# TODO add non-negative requirement to match previous implementation?
|
||||||
"chunksize": {"default": 16000000,
|
"chunksize": {
|
||||||
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
|
"default": 16000000,
|
||||||
'type': 'int',
|
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
|
||||||
},
|
"type": "int",
|
||||||
},
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
""" Hash Enricher for generating cryptographic hashes of media files.
|
"""Hash Enricher for generating cryptographic hashes of media files.
|
||||||
|
|
||||||
The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
|
The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
|
||||||
for media files stored in `Metadata` objects. These hashes are used for
|
for media files stored in `Metadata` objects. These hashes are used for
|
||||||
@@ -7,6 +7,7 @@ exact duplicates. The hash is computed by reading the file's bytes in chunks,
|
|||||||
making it suitable for handling large files efficiently.
|
making it suitable for handling large files efficiently.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
@@ -20,7 +21,6 @@ class HashEnricher(Enricher):
|
|||||||
Calculates hashes for Media instances
|
Calculates hashes for Media instances
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||||
@@ -35,5 +35,6 @@ class HashEnricher(Enricher):
|
|||||||
hash_algo = hashlib.sha256
|
hash_algo = hashlib.sha256
|
||||||
elif self.algorithm == "SHA3-512":
|
elif self.algorithm == "SHA3-512":
|
||||||
hash_algo = hashlib.sha3_512
|
hash_algo = hashlib.sha3_512
|
||||||
else: return ""
|
else:
|
||||||
|
return ""
|
||||||
return calculate_file_hash(filename, hash_algo, self.chunksize)
|
return calculate_file_hash(filename, hash_algo, self.chunksize)
|
||||||
|
|||||||
@@ -2,14 +2,13 @@
|
|||||||
"name": "HTML Formatter",
|
"name": "HTML Formatter",
|
||||||
"type": ["formatter"],
|
"type": ["formatter"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"dependencies": {
|
"dependencies": {"python": ["hash_enricher", "loguru", "jinja2"], "bin": [""]},
|
||||||
"python": ["hash_enricher", "loguru", "jinja2"],
|
|
||||||
"bin": [""]
|
|
||||||
},
|
|
||||||
"configs": {
|
"configs": {
|
||||||
"detect_thumbnails": {"default": True,
|
"detect_thumbnails": {
|
||||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
"default": True,
|
||||||
"type": "bool"},
|
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||||
|
"type": "bool",
|
||||||
},
|
},
|
||||||
|
},
|
||||||
"description": """ """,
|
"description": """ """,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import mimetypes, os, pathlib
|
import mimetypes
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -11,6 +13,7 @@ from auto_archiver.core import Metadata, Media
|
|||||||
from auto_archiver.core import Formatter
|
from auto_archiver.core import Formatter
|
||||||
from auto_archiver.utils.misc import random_str
|
from auto_archiver.utils.misc import random_str
|
||||||
|
|
||||||
|
|
||||||
class HtmlFormatter(Formatter):
|
class HtmlFormatter(Formatter):
|
||||||
environment: Environment = None
|
environment: Environment = None
|
||||||
template: any = None
|
template: any = None
|
||||||
@@ -21,9 +24,9 @@ class HtmlFormatter(Formatter):
|
|||||||
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
||||||
|
|
||||||
# JinjaHelper class static methods are added as filters
|
# JinjaHelper class static methods are added as filters
|
||||||
self.environment.filters.update({
|
self.environment.filters.update(
|
||||||
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
|
{k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)}
|
||||||
})
|
)
|
||||||
|
|
||||||
# Load a specific template or default to "html_template.html"
|
# Load a specific template or default to "html_template.html"
|
||||||
template_name = self.config.get("template_name", "html_template.html")
|
template_name = self.config.get("template_name", "html_template.html")
|
||||||
@@ -36,11 +39,7 @@ class HtmlFormatter(Formatter):
|
|||||||
return
|
return
|
||||||
|
|
||||||
content = self.template.render(
|
content = self.template.render(
|
||||||
url=url,
|
url=url, title=item.get_title(), media=item.media, metadata=item.metadata, version=__version__
|
||||||
title=item.get_title(),
|
|
||||||
media=item.media,
|
|
||||||
metadata=item.metadata,
|
|
||||||
version=__version__
|
|
||||||
)
|
)
|
||||||
|
|
||||||
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
|
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
|
||||||
@@ -49,7 +48,7 @@ class HtmlFormatter(Formatter):
|
|||||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||||
|
|
||||||
# get the already instantiated hash_enricher module
|
# get the already instantiated hash_enricher module
|
||||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||||
if len(hd := he.calculate_hash(final_media.filename)):
|
if len(hd := he.calculate_hash(final_media.filename)):
|
||||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||||
|
|
||||||
|
|||||||
@@ -2,18 +2,18 @@
|
|||||||
"name": "Instagram API Extractor",
|
"name": "Instagram API Extractor",
|
||||||
"type": ["extractor"],
|
"type": ["extractor"],
|
||||||
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
|
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
|
||||||
"dependencies":
|
"dependencies": {
|
||||||
{"python": ["requests",
|
"python": [
|
||||||
"loguru",
|
"requests",
|
||||||
"retrying",
|
"loguru",
|
||||||
"tqdm",],
|
"retrying",
|
||||||
},
|
"tqdm",
|
||||||
|
],
|
||||||
|
},
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"configs": {
|
"configs": {
|
||||||
"access_token": {"default": None,
|
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||||
"help": "a valid instagrapi-api token"},
|
"api_endpoint": {"required": True, "help": "API endpoint to use"},
|
||||||
"api_endpoint": {"required": True,
|
|
||||||
"help": "API endpoint to use"},
|
|
||||||
"full_profile": {
|
"full_profile": {
|
||||||
"default": False,
|
"default": False,
|
||||||
"type": "bool",
|
"type": "bool",
|
||||||
|
|||||||
@@ -36,21 +36,16 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
if self.api_endpoint[-1] == "/":
|
if self.api_endpoint[-1] == "/":
|
||||||
self.api_endpoint = self.api_endpoint[:-1]
|
self.api_endpoint = self.api_endpoint[:-1]
|
||||||
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
url.replace("instagr.com", "instagram.com").replace(
|
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
|
||||||
"instagr.am", "instagram.com"
|
|
||||||
)
|
|
||||||
insta_matches = self.valid_url.findall(url)
|
insta_matches = self.valid_url.findall(url)
|
||||||
logger.info(f"{insta_matches=}")
|
logger.info(f"{insta_matches=}")
|
||||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||||
return
|
return
|
||||||
if len(insta_matches) > 1:
|
if len(insta_matches) > 1:
|
||||||
logger.warning(
|
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
|
||||||
f"Multiple instagram matches found in {url=}, using the first one"
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
||||||
if g1 == "":
|
if g1 == "":
|
||||||
@@ -73,23 +68,20 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
def call_api(self, path: str, params: dict) -> dict:
|
def call_api(self, path: str, params: dict) -> dict:
|
||||||
headers = {"accept": "application/json", "x-access-key": self.access_token}
|
headers = {"accept": "application/json", "x-access-key": self.access_token}
|
||||||
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
|
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
|
||||||
return requests.get(
|
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
|
||||||
f"{self.api_endpoint}/{path}", headers=headers, params=params
|
|
||||||
).json()
|
|
||||||
|
|
||||||
def cleanup_dict(self, d: dict | list) -> dict:
|
def cleanup_dict(self, d: dict | list) -> dict:
|
||||||
# repeats 3 times to remove nested empty values
|
# repeats 3 times to remove nested empty values
|
||||||
if not self.minimize_json_output:
|
if not self.minimize_json_output:
|
||||||
return d
|
return d
|
||||||
if type(d) == list:
|
if isinstance(d, list):
|
||||||
return [self.cleanup_dict(v) for v in d]
|
return [self.cleanup_dict(v) for v in d]
|
||||||
if type(d) != dict:
|
if not isinstance(d, dict):
|
||||||
return d
|
return d
|
||||||
return {
|
return {
|
||||||
k: clean_v
|
k: clean_v
|
||||||
for k, v in d.items()
|
for k, v in d.items()
|
||||||
if (clean_v := self.cleanup_dict(v))
|
if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"]
|
||||||
not in [0.0, 0, [], {}, "", None, "null"]
|
|
||||||
and k not in ["x", "y", "width", "height"]
|
and k not in ["x", "y", "width", "height"]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,7 +95,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result.set_title(user.get("full_name", username)).set("data", user)
|
result.set_title(user.get("full_name", username)).set("data", user)
|
||||||
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
||||||
filename = self.download_from_url(pic_url)
|
filename = self.download_from_url(pic_url)
|
||||||
result.add_media(Media(filename=filename), id=f"profile_picture")
|
result.add_media(Media(filename=filename), id="profile_picture")
|
||||||
|
|
||||||
if self.full_profile:
|
if self.full_profile:
|
||||||
user_id = user.get("pk")
|
user_id = user.get("pk")
|
||||||
@@ -126,9 +118,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.download_all_tagged(result, user_id)
|
self.download_all_tagged(result, user_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append(
|
result.append("errors", f"Error downloading tagged posts for {username}")
|
||||||
"errors", f"Error downloading tagged posts for {username}"
|
|
||||||
)
|
|
||||||
logger.error(f"Error downloading tagged posts for {username}: {e}")
|
logger.error(f"Error downloading tagged posts for {username}: {e}")
|
||||||
|
|
||||||
# download all highlights
|
# download all highlights
|
||||||
@@ -143,7 +133,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
|
|
||||||
def download_all_highlights(self, result, username, user_id):
|
def download_all_highlights(self, result, username, user_id):
|
||||||
count_highlights = 0
|
count_highlights = 0
|
||||||
highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
|
highlights = self.call_api("v1/user/highlights", {"user_id": user_id})
|
||||||
for h in highlights:
|
for h in highlights:
|
||||||
try:
|
try:
|
||||||
h_info = self._download_highlights_reusable(result, h.get("pk"))
|
h_info = self._download_highlights_reusable(result, h.get("pk"))
|
||||||
@@ -153,26 +143,17 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
"errors",
|
"errors",
|
||||||
f"Error downloading highlight id{h.get('pk')} for {username}",
|
f"Error downloading highlight id{h.get('pk')} for {username}",
|
||||||
)
|
)
|
||||||
logger.error(
|
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
|
||||||
f"Error downloading highlight id{h.get('pk')} for {username}: {e}"
|
if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
|
||||||
)
|
logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||||
if (
|
|
||||||
self.full_profile_max_posts
|
|
||||||
and count_highlights >= self.full_profile_max_posts
|
|
||||||
):
|
|
||||||
logger.info(
|
|
||||||
f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}"
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
result.set("#highlights", count_highlights)
|
result.set("#highlights", count_highlights)
|
||||||
|
|
||||||
def download_post(
|
def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
|
||||||
self, result: Metadata, code: str = None, id: str = None, context: str = None
|
|
||||||
) -> Metadata:
|
|
||||||
if id:
|
if id:
|
||||||
post = self.call_api(f"v1/media/by/id", {"id": id})
|
post = self.call_api("v1/media/by/id", {"id": id})
|
||||||
else:
|
else:
|
||||||
post = self.call_api(f"v1/media/by/code", {"code": code})
|
post = self.call_api("v1/media/by/code", {"code": code})
|
||||||
assert post, f"Post {id or code} not found"
|
assert post, f"Post {id or code} not found"
|
||||||
|
|
||||||
if caption_text := post.get("caption_text"):
|
if caption_text := post.get("caption_text"):
|
||||||
@@ -192,15 +173,11 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
return result.success("insta highlights")
|
return result.success("insta highlights")
|
||||||
|
|
||||||
def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
|
def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
|
||||||
full_h = self.call_api(f"v2/highlight/by/id", {"id": id})
|
full_h = self.call_api("v2/highlight/by/id", {"id": id})
|
||||||
h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
|
h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
|
||||||
assert h_info, f"Highlight {id} not found: {full_h=}"
|
assert h_info, f"Highlight {id} not found: {full_h=}"
|
||||||
|
|
||||||
if (
|
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
||||||
cover_media := h_info.get("cover_media", {})
|
|
||||||
.get("cropped_image_version", {})
|
|
||||||
.get("url")
|
|
||||||
):
|
|
||||||
filename = self.download_from_url(cover_media)
|
filename = self.download_from_url(cover_media)
|
||||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||||
|
|
||||||
@@ -210,9 +187,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
self.scrape_item(result, h, "highlight")
|
self.scrape_item(result, h, "highlight")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
||||||
logger.error(
|
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")
|
||||||
f"Error downloading highlight, skipping {h.get('id')}: {e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return h_info
|
return h_info
|
||||||
|
|
||||||
@@ -225,7 +200,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
return result.success(f"insta stories {now}")
|
return result.success(f"insta stories {now}")
|
||||||
|
|
||||||
def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
|
def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
|
||||||
stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
|
stories = self.call_api("v1/user/stories/by/username", {"username": username})
|
||||||
if not stories or not len(stories):
|
if not stories or not len(stories):
|
||||||
return []
|
return []
|
||||||
stories = stories[::-1] # newest to oldest
|
stories = stories[::-1] # newest to oldest
|
||||||
@@ -244,10 +219,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
|
|
||||||
post_count = 0
|
post_count = 0
|
||||||
while end_cursor != "":
|
while end_cursor != "":
|
||||||
posts = self.call_api(
|
posts = self.call_api("v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
|
||||||
f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}
|
if not posts or not isinstance(posts, list) or len(posts) != 2:
|
||||||
)
|
|
||||||
if not len(posts) or not type(posts) == list or len(posts) != 2:
|
|
||||||
break
|
break
|
||||||
posts, end_cursor = posts[0], posts[1]
|
posts, end_cursor = posts[0], posts[1]
|
||||||
logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")
|
logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")
|
||||||
@@ -260,13 +233,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
|
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
post_count += 1
|
post_count += 1
|
||||||
if (
|
if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
|
||||||
self.full_profile_max_posts
|
logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||||
and post_count >= self.full_profile_max_posts
|
|
||||||
):
|
|
||||||
logger.info(
|
|
||||||
f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}"
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
result.set("#posts", post_count)
|
result.set("#posts", post_count)
|
||||||
|
|
||||||
@@ -275,10 +243,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
pbar = tqdm(desc="downloading tagged posts")
|
pbar = tqdm(desc="downloading tagged posts")
|
||||||
|
|
||||||
tagged_count = 0
|
tagged_count = 0
|
||||||
while next_page_id != None:
|
while next_page_id is not None:
|
||||||
resp = self.call_api(
|
resp = self.call_api("v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
|
||||||
f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}
|
|
||||||
)
|
|
||||||
posts = resp.get("response", {}).get("items", [])
|
posts = resp.get("response", {}).get("items", [])
|
||||||
if not len(posts):
|
if not len(posts):
|
||||||
break
|
break
|
||||||
@@ -290,21 +256,12 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.scrape_item(result, p, "tagged")
|
self.scrape_item(result, p, "tagged")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append(
|
result.append("errors", f"Error downloading tagged post {p.get('id')}")
|
||||||
"errors", f"Error downloading tagged post {p.get('id')}"
|
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
|
||||||
)
|
|
||||||
logger.error(
|
|
||||||
f"Error downloading tagged post, skipping {p.get('id')}: {e}"
|
|
||||||
)
|
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
tagged_count += 1
|
tagged_count += 1
|
||||||
if (
|
if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
|
||||||
self.full_profile_max_posts
|
logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||||
and tagged_count >= self.full_profile_max_posts
|
|
||||||
):
|
|
||||||
logger.info(
|
|
||||||
f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}"
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
result.set("#tagged", tagged_count)
|
result.set("#tagged", tagged_count)
|
||||||
|
|
||||||
@@ -318,9 +275,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
context can be used to give specific id prefixes to media
|
context can be used to give specific id prefixes to media
|
||||||
"""
|
"""
|
||||||
if "clips_metadata" in item:
|
if "clips_metadata" in item:
|
||||||
if reusable_text := item.get("clips_metadata", {}).get(
|
if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
|
||||||
"reusable_text_attribute_string"
|
|
||||||
):
|
|
||||||
item["clips_metadata_text"] = reusable_text
|
item["clips_metadata_text"] = reusable_text
|
||||||
if self.minimize_json_output:
|
if self.minimize_json_output:
|
||||||
del item["clips_metadata"]
|
del item["clips_metadata"]
|
||||||
|
|||||||
@@ -9,8 +9,7 @@
|
|||||||
},
|
},
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"configs": {
|
"configs": {
|
||||||
"username": {"required": True,
|
"username": {"required": True, "help": "A valid Instagram username."},
|
||||||
"help": "A valid Instagram username."},
|
|
||||||
"password": {
|
"password": {
|
||||||
"required": True,
|
"required": True,
|
||||||
"help": "The corresponding Instagram account password.",
|
"help": "The corresponding Instagram account password.",
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
""" Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
"""Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
||||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import re, os, shutil
|
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
import instaloader
|
import instaloader
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
@@ -11,6 +14,7 @@ from auto_archiver.core import Extractor
|
|||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
from auto_archiver.core import Media
|
from auto_archiver.core import Media
|
||||||
|
|
||||||
|
|
||||||
class InstagramExtractor(Extractor):
|
class InstagramExtractor(Extractor):
|
||||||
"""
|
"""
|
||||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||||
@@ -25,26 +29,24 @@ class InstagramExtractor(Extractor):
|
|||||||
# TODO: links to stories
|
# TODO: links to stories
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
self.insta = instaloader.Instaloader(
|
self.insta = instaloader.Instaloader(
|
||||||
download_geotags=True,
|
download_geotags=True,
|
||||||
download_comments=True,
|
download_comments=True,
|
||||||
compress_json=False,
|
compress_json=False,
|
||||||
dirname_pattern=self.download_folder,
|
dirname_pattern=self.download_folder,
|
||||||
filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
filename_pattern="{date_utc}_UTC_{target}__{typename}",
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
self.insta.load_session_from_file(self.username, self.session_file)
|
self.insta.load_session_from_file(self.username, self.session_file)
|
||||||
except Exception as e:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Session file failed", exc_info=True)
|
logger.debug("Session file failed", exc_info=True)
|
||||||
logger.info("No valid session file found - Attempting login with use and password.")
|
logger.info("No valid session file found - Attempting login with use and password.")
|
||||||
self.insta.login(self.username, self.password)
|
self.insta.login(self.username, self.password)
|
||||||
self.insta.save_session_to_file(self.session_file)
|
self.insta.save_session_to_file(self.session_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||||
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
@@ -53,7 +55,8 @@ class InstagramExtractor(Extractor):
|
|||||||
profile_matches = self.profile_pattern.findall(url)
|
profile_matches = self.profile_pattern.findall(url)
|
||||||
|
|
||||||
# return if not a valid instagram link
|
# return if not a valid instagram link
|
||||||
if not len(post_matches) and not len(profile_matches): return
|
if not len(post_matches) and not len(profile_matches):
|
||||||
|
return
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
try:
|
try:
|
||||||
@@ -65,7 +68,9 @@ class InstagramExtractor(Extractor):
|
|||||||
elif len(profile_matches):
|
elif len(profile_matches):
|
||||||
result = self.download_profile(url, profile_matches[0])
|
result = self.download_profile(url, profile_matches[0])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
|
logger.error(
|
||||||
|
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||||
return result
|
return result
|
||||||
@@ -84,35 +89,50 @@ class InstagramExtractor(Extractor):
|
|||||||
profile = instaloader.Profile.from_username(self.insta.context, username)
|
profile = instaloader.Profile.from_username(self.insta.context, username)
|
||||||
try:
|
try:
|
||||||
for post in profile.get_posts():
|
for post in profile.get_posts():
|
||||||
try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
try:
|
||||||
except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||||
except Exception as e: logger.error(f"Failed profile.get_posts: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed profile.get_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_tagged_posts():
|
for post in profile.get_tagged_posts():
|
||||||
try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
try:
|
||||||
except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||||
except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed profile.get_tagged_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_igtv_posts():
|
for post in profile.get_igtv_posts():
|
||||||
try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
try:
|
||||||
except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||||
except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed profile.get_igtv_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for story in self.insta.get_stories([profile.userid]):
|
for story in self.insta.get_stories([profile.userid]):
|
||||||
for item in story.get_items():
|
for item in story.get_items():
|
||||||
try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
try:
|
||||||
except Exception as e: logger.error(f"Failed to download story item: {item}: {e}")
|
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||||
except Exception as e: logger.error(f"Failed get_stories: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download story item: {item}: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed get_stories: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for highlight in self.insta.get_highlights(profile.userid):
|
for highlight in self.insta.get_highlights(profile.userid):
|
||||||
for item in highlight.get_items():
|
for item in highlight.get_items():
|
||||||
try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
try:
|
||||||
except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}")
|
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||||
except Exception as e: logger.error(f"Failed get_highlights: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download highlight item: {item}: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed get_highlights: {e}")
|
||||||
|
|
||||||
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
||||||
|
|
||||||
@@ -124,7 +144,8 @@ class InstagramExtractor(Extractor):
|
|||||||
all_media = []
|
all_media = []
|
||||||
for f in os.listdir(self.download_folder):
|
for f in os.listdir(self.download_folder):
|
||||||
if os.path.isfile((filename := os.path.join(self.download_folder, f))):
|
if os.path.isfile((filename := os.path.join(self.download_folder, f))):
|
||||||
if filename[-4:] == ".txt": continue
|
if filename[-4:] == ".txt":
|
||||||
|
continue
|
||||||
all_media.append(Media(filename))
|
all_media.append(Media(filename))
|
||||||
|
|
||||||
assert len(all_media) > 1, "No uploaded media found"
|
assert len(all_media) > 1, "No uploaded media found"
|
||||||
|
|||||||
@@ -1,16 +1,21 @@
|
|||||||
{
|
{
|
||||||
"name": "Instagram Telegram Bot Extractor",
|
"name": "Instagram Telegram Bot Extractor",
|
||||||
"type": ["extractor"],
|
"type": ["extractor"],
|
||||||
"dependencies": {"python": ["loguru", "telethon",],
|
"dependencies": {
|
||||||
},
|
"python": [
|
||||||
|
"loguru",
|
||||||
|
"telethon",
|
||||||
|
],
|
||||||
|
},
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"configs": {
|
"configs": {
|
||||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
"session_file": {
|
||||||
"timeout": {"default": 45,
|
"default": "secrets/anon-insta",
|
||||||
"type": "int",
|
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||||
"help": "timeout to fetch the instagram content in seconds."},
|
},
|
||||||
|
"timeout": {"default": 45, "type": "int", "help": "timeout to fetch the instagram content in seconds."},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
"""Initializes the Telegram client."""
|
"""Initializes the Telegram client."""
|
||||||
try:
|
try:
|
||||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||||
except OperationalError as e:
|
except OperationalError:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Unable to access the {self.session_file} session. "
|
f"Unable to access the {self.session_file} session. "
|
||||||
"Ensure that you don't use the same session file here and in telethon_extractor. "
|
"Ensure that you don't use the same session file here and in telethon_extractor. "
|
||||||
@@ -68,12 +68,12 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if not "instagram.com" in url: return False
|
if "instagram.com" not in url:
|
||||||
|
return False
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
tmp_dir = self.tmp_dir
|
tmp_dir = self.tmp_dir
|
||||||
with self.client.start():
|
with self.client.start():
|
||||||
|
|
||||||
chat, since_id = self._send_url_to_bot(url)
|
chat, since_id = self._send_url_to_bot(url)
|
||||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||||
|
|
||||||
@@ -110,13 +110,14 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||||
since_id = max(since_id, post.id)
|
since_id = max(since_id, post.id)
|
||||||
# Skip known filler message:
|
# Skip known filler message:
|
||||||
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
|
if post.message == "The bot receives information through https://hikerapi.com/p/hJqpppqi":
|
||||||
continue
|
continue
|
||||||
if post.media and post.id not in seen_media:
|
if post.media and post.id not in seen_media:
|
||||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
filename_dest = os.path.join(tmp_dir, f"{chat.id}_{post.id}")
|
||||||
media = self.client.download_media(post.media, filename_dest)
|
media = self.client.download_media(post.media, filename_dest)
|
||||||
if media:
|
if media:
|
||||||
result.add_media(Media(media))
|
result.add_media(Media(media))
|
||||||
seen_media.append(post.id)
|
seen_media.append(post.id)
|
||||||
if post.message: message += post.message
|
if post.message:
|
||||||
|
message += post.message
|
||||||
return message.strip()
|
return message.strip()
|
||||||
@@ -17,9 +17,11 @@
|
|||||||
"choices": ["random", "static"],
|
"choices": ["random", "static"],
|
||||||
},
|
},
|
||||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||||
"save_absolute": {"default": False,
|
"save_absolute": {
|
||||||
"type": "bool",
|
"default": False,
|
||||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
"type": "bool",
|
||||||
|
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||||
@@ -33,5 +35,5 @@
|
|||||||
### Notes
|
### Notes
|
||||||
- Default storage folder is `./archived`, but this can be changed via the `save_to` configuration.
|
- Default storage folder is `./archived`, but this can be changed via the `save_to` configuration.
|
||||||
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
|
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
from typing import IO
|
from typing import IO
|
||||||
import os
|
import os
|
||||||
@@ -8,12 +7,13 @@ from auto_archiver.core import Media
|
|||||||
from auto_archiver.core import Storage
|
from auto_archiver.core import Storage
|
||||||
from auto_archiver.core.consts import SetupError
|
from auto_archiver.core.consts import SetupError
|
||||||
|
|
||||||
|
|
||||||
class LocalStorage(Storage):
|
class LocalStorage(Storage):
|
||||||
|
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
if len(self.save_to) > 200:
|
if len(self.save_to) > 200:
|
||||||
raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
|
raise SetupError(
|
||||||
|
"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path."
|
||||||
|
)
|
||||||
|
|
||||||
def get_cdn_url(self, media: Media) -> str:
|
def get_cdn_url(self, media: Media) -> str:
|
||||||
dest = media.key
|
dest = media.key
|
||||||
@@ -25,18 +25,18 @@ class LocalStorage(Storage):
|
|||||||
def set_key(self, media, url, metadata):
|
def set_key(self, media, url, metadata):
|
||||||
# clarify we want to save the file to the save_to folder
|
# clarify we want to save the file to the save_to folder
|
||||||
|
|
||||||
old_folder = metadata.get('folder', '')
|
old_folder = metadata.get("folder", "")
|
||||||
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
|
metadata.set_context("folder", os.path.join(self.save_to, metadata.get("folder", "")))
|
||||||
super().set_key(media, url, metadata)
|
super().set_key(media, url, metadata)
|
||||||
# don't impact other storages that might want a different 'folder' set
|
# don't impact other storages that might want a different 'folder' set
|
||||||
metadata.set_context('folder', old_folder)
|
metadata.set_context("folder", old_folder)
|
||||||
|
|
||||||
def upload(self, media: Media, **kwargs) -> bool:
|
def upload(self, media: Media, **kwargs) -> bool:
|
||||||
# override parent so that we can use shutil.copy2 and keep metadata
|
# override parent so that we can use shutil.copy2 and keep metadata
|
||||||
dest = media.key
|
dest = media.key
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
||||||
|
|
||||||
res = shutil.copy2(media.filename, dest)
|
res = shutil.copy2(media.filename, dest)
|
||||||
logger.info(res)
|
logger.info(res)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["loguru"],
|
"python": ["loguru"],
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Adds metadata information about the archive operations, Adds metadata about archive operations, including file sizes and archive duration./
|
Adds metadata information about the archive operations, Adds metadata about archive operations, including file sizes and archive duration./
|
||||||
|
|||||||
@@ -23,7 +23,9 @@ class MetaEnricher(Enricher):
|
|||||||
self.enrich_archive_duration(to_enrich)
|
self.enrich_archive_duration(to_enrich)
|
||||||
|
|
||||||
def enrich_file_sizes(self, to_enrich: Metadata):
|
def enrich_file_sizes(self, to_enrich: Metadata):
|
||||||
logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
|
logger.debug(
|
||||||
|
f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)"
|
||||||
|
)
|
||||||
total_size = 0
|
total_size = 0
|
||||||
for media in to_enrich.get_all_media():
|
for media in to_enrich.get_all_media():
|
||||||
file_stats = os.stat(media.filename)
|
file_stats = os.stat(media.filename)
|
||||||
@@ -34,7 +36,6 @@ class MetaEnricher(Enricher):
|
|||||||
to_enrich.set("total_bytes", total_size)
|
to_enrich.set("total_bytes", total_size)
|
||||||
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
||||||
|
|
||||||
|
|
||||||
def human_readable_bytes(self, size: int) -> str:
|
def human_readable_bytes(self, size: int) -> str:
|
||||||
# receives number of bytes and returns human readble size
|
# receives number of bytes and returns human readble size
|
||||||
for unit in ["bytes", "KB", "MB", "GB", "TB"]:
|
for unit in ["bytes", "KB", "MB", "GB", "TB"]:
|
||||||
|
|||||||
@@ -2,10 +2,7 @@
|
|||||||
"name": "Media Metadata Enricher",
|
"name": "Media Metadata Enricher",
|
||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||||
"python": ["loguru"],
|
|
||||||
"bin": ["exiftool"]
|
|
||||||
},
|
|
||||||
"description": """
|
"description": """
|
||||||
Extracts metadata information from files using ExifTool.
|
Extracts metadata information from files using ExifTool.
|
||||||
|
|
||||||
@@ -17,5 +14,5 @@
|
|||||||
### Notes
|
### Notes
|
||||||
- Requires ExifTool to be installed and accessible via the system's PATH.
|
- Requires ExifTool to be installed and accessible via the system's PATH.
|
||||||
- Skips enrichment for files where metadata extraction fails.
|
- Skips enrichment for files where metadata extraction fails.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ class MetadataEnricher(Enricher):
|
|||||||
Extracts metadata information from files using exiftool.
|
Extracts metadata information from files using exiftool.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"extracting EXIF metadata for {url=}")
|
logger.debug(f"extracting EXIF metadata for {url=}")
|
||||||
@@ -23,13 +22,13 @@ class MetadataEnricher(Enricher):
|
|||||||
def get_metadata(self, filename: str) -> dict:
|
def get_metadata(self, filename: str) -> dict:
|
||||||
try:
|
try:
|
||||||
# Run ExifTool command to extract metadata from the file
|
# Run ExifTool command to extract metadata from the file
|
||||||
cmd = ['exiftool', filename]
|
cmd = ["exiftool", filename]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
|
||||||
# Process the output to extract individual metadata fields
|
# Process the output to extract individual metadata fields
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for line in result.stdout.splitlines():
|
for line in result.stdout.splitlines():
|
||||||
field, value = line.strip().split(':', 1)
|
field, value = line.strip().split(":", 1)
|
||||||
metadata[field.strip()] = value.strip()
|
metadata[field.strip()] = value.strip()
|
||||||
return metadata
|
return metadata
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
|
|||||||
@@ -2,8 +2,7 @@
|
|||||||
"name": "Mute Formatter",
|
"name": "Mute Formatter",
|
||||||
"type": ["formatter"],
|
"type": ["formatter"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {},
|
||||||
},
|
|
||||||
"description": """ Default formatter.
|
"description": """ Default formatter.
|
||||||
""",
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,5 +5,5 @@ from auto_archiver.core import Formatter
|
|||||||
|
|
||||||
|
|
||||||
class MuteFormatter(Formatter):
|
class MuteFormatter(Formatter):
|
||||||
|
def format(self, item: Metadata) -> Media:
|
||||||
def format(self, item: Metadata) -> Media: return None
|
return None
|
||||||
|
|||||||
@@ -17,5 +17,5 @@
|
|||||||
### Notes
|
### Notes
|
||||||
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
||||||
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ This enricher is typically used after thumbnail or screenshot enrichers
|
|||||||
to ensure images are available for hashing.
|
to ensure images are available for hashing.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
import pdqhash
|
import pdqhash
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -34,7 +35,12 @@ class PdqHashEnricher(Enricher):
|
|||||||
for m in to_enrich.media:
|
for m in to_enrich.media:
|
||||||
for media in m.all_inner_media(True):
|
for media in m.all_inner_media(True):
|
||||||
media_id = media.get("id", "")
|
media_id = media.get("id", "")
|
||||||
if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)):
|
if (
|
||||||
|
media.is_image()
|
||||||
|
and "screenshot" not in media_id
|
||||||
|
and "warc-file-" not in media_id
|
||||||
|
and len(hd := self.calculate_pdq_hash(media.filename))
|
||||||
|
):
|
||||||
media.set("pdq_hash", hd)
|
media.set("pdq_hash", hd)
|
||||||
media_with_hashes.append(media.filename)
|
media_with_hashes.append(media.filename)
|
||||||
|
|
||||||
@@ -51,5 +57,7 @@ class PdqHashEnricher(Enricher):
|
|||||||
hash = "".join(str(b) for b in hash_array)
|
hash = "".join(str(b) for b in hash_array)
|
||||||
return hex(int(hash, 2))[2:]
|
return hex(int(hash, 2))[2:]
|
||||||
except UnidentifiedImageError as e:
|
except UnidentifiedImageError as e:
|
||||||
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
|
logger.error(
|
||||||
|
f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}"
|
||||||
|
)
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -20,20 +20,20 @@
|
|||||||
"region": {"default": None, "help": "S3 region name"},
|
"region": {"default": None, "help": "S3 region name"},
|
||||||
"key": {"default": None, "help": "S3 API key"},
|
"key": {"default": None, "help": "S3 API key"},
|
||||||
"secret": {"default": None, "help": "S3 API secret"},
|
"secret": {"default": None, "help": "S3 API secret"},
|
||||||
"random_no_duplicate": {"default": False,
|
"random_no_duplicate": {
|
||||||
"type": "bool",
|
"default": False,
|
||||||
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
|
"type": "bool",
|
||||||
|
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`",
|
||||||
|
},
|
||||||
"endpoint_url": {
|
"endpoint_url": {
|
||||||
"default": 'https://{region}.digitaloceanspaces.com',
|
"default": "https://{region}.digitaloceanspaces.com",
|
||||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
"help": "S3 bucket endpoint, {region} are inserted at runtime",
|
||||||
},
|
},
|
||||||
"cdn_url": {
|
"cdn_url": {
|
||||||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
"default": "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}",
|
||||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime",
|
||||||
},
|
},
|
||||||
"private": {"default": False,
|
"private": {"default": False, "type": "bool", "help": "if true S3 files will not be readable online"},
|
||||||
"type": "bool",
|
|
||||||
"help": "if true S3 files will not be readable online"},
|
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
||||||
@@ -50,5 +50,5 @@
|
|||||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||||
- Uses `boto3` for interaction with the S3 API.
|
- Uses `boto3` for interaction with the S3 API.
|
||||||
- Depends on the `HashEnricher` module for hash calculation.
|
- Depends on the `HashEnricher` module for hash calculation.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
@@ -11,33 +10,36 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str
|
|||||||
|
|
||||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||||
|
|
||||||
class S3Storage(Storage):
|
|
||||||
|
|
||||||
|
class S3Storage(Storage):
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.s3 = boto3.client(
|
self.s3 = boto3.client(
|
||||||
's3',
|
"s3",
|
||||||
region_name=self.region,
|
region_name=self.region,
|
||||||
endpoint_url=self.endpoint_url.format(region=self.region),
|
endpoint_url=self.endpoint_url.format(region=self.region),
|
||||||
aws_access_key_id=self.key,
|
aws_access_key_id=self.key,
|
||||||
aws_secret_access_key=self.secret
|
aws_secret_access_key=self.secret,
|
||||||
)
|
)
|
||||||
if self.random_no_duplicate:
|
if self.random_no_duplicate:
|
||||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
logger.warning(
|
||||||
|
"random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
|
||||||
|
)
|
||||||
|
|
||||||
def get_cdn_url(self, media: Media) -> str:
|
def get_cdn_url(self, media: Media) -> str:
|
||||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||||
|
|
||||||
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
||||||
if not self.is_upload_needed(media): return True
|
if not self.is_upload_needed(media):
|
||||||
|
return True
|
||||||
|
|
||||||
extra_args = kwargs.get("extra_args", {})
|
extra_args = kwargs.get("extra_args", {})
|
||||||
if not self.private and 'ACL' not in extra_args:
|
if not self.private and "ACL" not in extra_args:
|
||||||
extra_args['ACL'] = 'public-read'
|
extra_args["ACL"] = "public-read"
|
||||||
|
|
||||||
if 'ContentType' not in extra_args:
|
if "ContentType" not in extra_args:
|
||||||
try:
|
try:
|
||||||
if media.mimetype:
|
if media.mimetype:
|
||||||
extra_args['ContentType'] = media.mimetype
|
extra_args["ContentType"] = media.mimetype
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||||
@@ -49,7 +51,7 @@ class S3Storage(Storage):
|
|||||||
hd = calculate_file_hash(media.filename)
|
hd = calculate_file_hash(media.filename)
|
||||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||||
|
|
||||||
if existing_key:=self.file_in_folder(path):
|
if existing_key := self.file_in_folder(path):
|
||||||
media._key = existing_key
|
media._key = existing_key
|
||||||
media.set("previously archived", True)
|
media.set("previously archived", True)
|
||||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||||
@@ -59,11 +61,11 @@ class S3Storage(Storage):
|
|||||||
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def file_in_folder(self, path:str) -> str:
|
def file_in_folder(self, path: str) -> str:
|
||||||
# checks if path exists and is not an empty folder
|
# checks if path exists and is not an empty folder
|
||||||
if not path.endswith('/'):
|
if not path.endswith("/"):
|
||||||
path = path + '/'
|
path = path + "/"
|
||||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
|
||||||
if 'Contents' in resp:
|
if "Contents" in resp:
|
||||||
return resp['Contents'][0]['Key']
|
return resp["Contents"][0]["Key"]
|
||||||
return False
|
return False
|
||||||
@@ -6,26 +6,29 @@
|
|||||||
"python": ["loguru", "selenium"],
|
"python": ["loguru", "selenium"],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"width": {"default": 1280,
|
"width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
|
||||||
"type": "int",
|
"height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
|
||||||
"help": "width of the screenshots"},
|
"timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
|
||||||
"height": {"default": 1024,
|
"sleep_before_screenshot": {
|
||||||
"type": "int",
|
"default": 4,
|
||||||
"help": "height of the screenshots"},
|
"type": "int",
|
||||||
"timeout": {"default": 60,
|
"help": "seconds to wait for the pages to load before taking screenshot",
|
||||||
"type": "int",
|
|
||||||
"help": "timeout for taking the screenshot"},
|
|
||||||
"sleep_before_screenshot": {"default": 4,
|
|
||||||
"type": "int",
|
|
||||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
|
||||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
|
||||||
"save_to_pdf": {"default": False,
|
|
||||||
"type": "bool",
|
|
||||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
|
||||||
"print_options": {"default": {},
|
|
||||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
|
||||||
"type": "json_loader"},
|
|
||||||
},
|
},
|
||||||
|
"http_proxy": {
|
||||||
|
"default": "",
|
||||||
|
"help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
|
||||||
|
},
|
||||||
|
"save_to_pdf": {
|
||||||
|
"default": False,
|
||||||
|
"type": "bool",
|
||||||
|
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
|
||||||
|
},
|
||||||
|
"print_options": {
|
||||||
|
"default": {},
|
||||||
|
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||||
|
"type": "json_loader",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||||
|
|
||||||
@@ -37,5 +40,5 @@
|
|||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
import time, os
|
import time
|
||||||
|
import os
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from selenium.common.exceptions import TimeoutException
|
from selenium.common.exceptions import TimeoutException
|
||||||
@@ -9,8 +10,8 @@ from auto_archiver.core import Enricher
|
|||||||
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
|
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
|
||||||
from auto_archiver.core import Media, Metadata
|
from auto_archiver.core import Media, Metadata
|
||||||
|
|
||||||
class ScreenshotEnricher(Enricher):
|
|
||||||
|
|
||||||
|
class ScreenshotEnricher(Enricher):
|
||||||
def __init__(self, webdriver_factory=None):
|
def __init__(self, webdriver_factory=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.webdriver_factory = webdriver_factory or Webdriver
|
self.webdriver_factory = webdriver_factory or Webdriver
|
||||||
@@ -25,8 +26,14 @@ class ScreenshotEnricher(Enricher):
|
|||||||
logger.debug(f"Enriching screenshot for {url=}")
|
logger.debug(f"Enriching screenshot for {url=}")
|
||||||
auth = self.auth_for_site(url)
|
auth = self.auth_for_site(url)
|
||||||
with self.webdriver_factory(
|
with self.webdriver_factory(
|
||||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
self.width,
|
||||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
self.height,
|
||||||
|
self.timeout,
|
||||||
|
facebook_accept_cookies="facebook.com" in url,
|
||||||
|
http_proxy=self.http_proxy,
|
||||||
|
print_options=self.print_options,
|
||||||
|
auth=auth,
|
||||||
|
) as driver:
|
||||||
try:
|
try:
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(int(self.sleep_before_screenshot))
|
time.sleep(int(self.sleep_before_screenshot))
|
||||||
@@ -43,4 +50,3 @@ class ScreenshotEnricher(Enricher):
|
|||||||
logger.info("TimeoutException loading page for screenshot")
|
logger.info("TimeoutException loading page for screenshot")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -5,11 +5,13 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["loguru", "slugify"],
|
"python": ["loguru", "slugify"],
|
||||||
},
|
},
|
||||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
"entry_point": "ssl_enricher::SSLEnricher",
|
||||||
"configs": {
|
"configs": {
|
||||||
"skip_when_nothing_archived": {"default": True,
|
"skip_when_nothing_archived": {
|
||||||
"type": 'bool',
|
"default": True,
|
||||||
"help": "if true, will skip enriching when no media is archived"},
|
"type": "bool",
|
||||||
|
"help": "if true, will skip enriching when no media is archived",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||||
@@ -21,5 +23,5 @@
|
|||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
- Requires the target URL to use the HTTPS scheme; other schemes are not supported.
|
- Requires the target URL to use the HTTPS scheme; other schemes are not supported.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import ssl, os
|
import ssl
|
||||||
|
import os
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -13,7 +14,8 @@ class SSLEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
if not to_enrich.media and self.skip_when_nothing_archived: return
|
if not to_enrich.media and self.skip_when_nothing_archived:
|
||||||
|
return
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
@@ -24,5 +26,6 @@ class SSLEnricher(Enricher):
|
|||||||
|
|
||||||
cert = ssl.get_server_certificate((domain, 443))
|
cert = ssl.get_server_certificate((domain, 443))
|
||||||
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
||||||
with open(cert_fn, "w") as f: f.write(cert)
|
with open(cert_fn, "w") as f:
|
||||||
|
f.write(cert)
|
||||||
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
|
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import requests, re, html
|
import requests
|
||||||
|
import re
|
||||||
|
import html
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
@@ -15,11 +17,11 @@ class TelegramExtractor(Extractor):
|
|||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
if 't.me' != item.netloc:
|
if "t.me" != item.netloc:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: check if we can do this more resilient to variable URLs
|
# TODO: check if we can do this more resilient to variable URLs
|
||||||
@@ -27,11 +29,11 @@ class TelegramExtractor(Extractor):
|
|||||||
url += "?embed=1"
|
url += "?embed=1"
|
||||||
|
|
||||||
t = requests.get(url, headers=headers)
|
t = requests.get(url, headers=headers)
|
||||||
s = BeautifulSoup(t.content, 'html.parser')
|
s = BeautifulSoup(t.content, "html.parser")
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
result.set_content(html.escape(str(t.content)))
|
result.set_content(html.escape(str(t.content)))
|
||||||
if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')):
|
if timestamp := (s.find_all("time") or [{}])[0].get("datetime"):
|
||||||
result.set_timestamp(timestamp)
|
result.set_timestamp(timestamp)
|
||||||
|
|
||||||
video = s.find("video")
|
video = s.find("video")
|
||||||
@@ -41,25 +43,26 @@ class TelegramExtractor(Extractor):
|
|||||||
|
|
||||||
image_urls = []
|
image_urls = []
|
||||||
for im in image_tags:
|
for im in image_tags:
|
||||||
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
|
urls = [u.replace("'", "") for u in re.findall(r"url\((.*?)\)", im["style"])]
|
||||||
image_urls += urls
|
image_urls += urls
|
||||||
|
|
||||||
if not len(image_urls): return False
|
if not len(image_urls):
|
||||||
|
return False
|
||||||
for img_url in image_urls:
|
for img_url in image_urls:
|
||||||
result.add_media(Media(self.download_from_url(img_url)))
|
result.add_media(Media(self.download_from_url(img_url)))
|
||||||
else:
|
else:
|
||||||
video_url = video.get('src')
|
video_url = video.get("src")
|
||||||
m_video = Media(self.download_from_url(video_url))
|
m_video = Media(self.download_from_url(video_url))
|
||||||
# extract duration from HTML
|
# extract duration from HTML
|
||||||
try:
|
try:
|
||||||
duration = s.find_all('time')[0].contents[0]
|
duration = s.find_all("time")[0].contents[0]
|
||||||
if ':' in duration:
|
if ":" in duration:
|
||||||
duration = float(duration.split(
|
duration = float(duration.split(":")[0]) * 60 + float(duration.split(":")[1])
|
||||||
':')[0]) * 60 + float(duration.split(':')[1])
|
|
||||||
else:
|
else:
|
||||||
duration = float(duration)
|
duration = float(duration)
|
||||||
m_video.set("duration", duration)
|
m_video.set("duration", duration)
|
||||||
except: pass
|
except Exception:
|
||||||
|
pass
|
||||||
result.add_media(m_video)
|
result.add_media(m_video)
|
||||||
|
|
||||||
return result.success("telegram")
|
return result.success("telegram")
|
||||||
|
|||||||
@@ -3,26 +3,35 @@
|
|||||||
"type": ["extractor"],
|
"type": ["extractor"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["telethon",
|
"python": [
|
||||||
"loguru",
|
"telethon",
|
||||||
"tqdm",
|
"loguru",
|
||||||
],
|
"tqdm",
|
||||||
"bin": [""]
|
],
|
||||||
|
"bin": [""],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
"bot_token": {
|
||||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
"default": None,
|
||||||
"join_channels": {"default": True,
|
"help": "optional, but allows access to more content such as large videos, talk to @botfather",
|
||||||
"type": "bool",
|
|
||||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
|
||||||
"channel_invites": {
|
|
||||||
"default": {},
|
|
||||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
|
||||||
"type": "json_loader",
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
|
"session_file": {
|
||||||
|
"default": "secrets/anon",
|
||||||
|
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||||
|
},
|
||||||
|
"join_channels": {
|
||||||
|
"default": True,
|
||||||
|
"type": "bool",
|
||||||
|
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck",
|
||||||
|
},
|
||||||
|
"channel_invites": {
|
||||||
|
"default": {},
|
||||||
|
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||||
|
"type": "json_loader",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
|
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
|
||||||
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
|
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
|
||||||
@@ -46,5 +55,5 @@ To use the `TelethonExtractor`, you must configure the following:
|
|||||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
||||||
|
|
||||||
|
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
@@ -1,12 +1,18 @@
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
from telethon.sync import TelegramClient
|
from telethon.sync import TelegramClient
|
||||||
from telethon.errors import ChannelInvalidError
|
from telethon.errors import ChannelInvalidError
|
||||||
from telethon.tl.functions.messages import ImportChatInviteRequest
|
from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||||
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
|
from telethon.errors.rpcerrorlist import (
|
||||||
|
UserAlreadyParticipantError,
|
||||||
|
FloodWaitError,
|
||||||
|
InviteRequestSentError,
|
||||||
|
InviteHashExpiredError,
|
||||||
|
)
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import re, time, os
|
import re
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
@@ -17,9 +23,7 @@ class TelethonExtractor(Extractor):
|
|||||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||||
|
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
1. makes a copy of session_file that is removed in cleanup
|
1. makes a copy of session_file that is removed in cleanup
|
||||||
2. trigger login process for telegram or proceed if already saved in a session file
|
2. trigger login process for telegram or proceed if already saved in a session file
|
||||||
@@ -52,18 +56,20 @@ class TelethonExtractor(Extractor):
|
|||||||
channel_invite = self.channel_invites[i]
|
channel_invite = self.channel_invites[i]
|
||||||
channel_id = channel_invite.get("id", False)
|
channel_id = channel_invite.get("id", False)
|
||||||
invite = channel_invite["invite"]
|
invite = channel_invite["invite"]
|
||||||
if (match := self.invite_pattern.search(invite)):
|
if match := self.invite_pattern.search(invite):
|
||||||
try:
|
try:
|
||||||
if channel_id:
|
if channel_id:
|
||||||
ent = self.client.get_entity(int(channel_id)) # fails if not a member
|
ent = self.client.get_entity(int(channel_id)) # fails if not a member
|
||||||
else:
|
else:
|
||||||
ent = self.client.get_entity(invite) # fails if not a member
|
ent = self.client.get_entity(invite) # fails if not a member
|
||||||
logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.")
|
logger.warning(
|
||||||
except ValueError as e:
|
f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
logger.info(f"joining new channel {invite=}")
|
logger.info(f"joining new channel {invite=}")
|
||||||
try:
|
try:
|
||||||
self.client(ImportChatInviteRequest(match.group(2)))
|
self.client(ImportChatInviteRequest(match.group(2)))
|
||||||
except UserAlreadyParticipantError as e:
|
except UserAlreadyParticipantError:
|
||||||
logger.info(f"already joined {invite=}")
|
logger.info(f"already joined {invite=}")
|
||||||
except InviteRequestSentError:
|
except InviteRequestSentError:
|
||||||
logger.warning(f"already sent a join request with {invite} still no answer")
|
logger.warning(f"already sent a join request with {invite} still no answer")
|
||||||
@@ -95,7 +101,8 @@ class TelethonExtractor(Extractor):
|
|||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
match = self.valid_url.search(url)
|
match = self.valid_url.search(url)
|
||||||
logger.debug(f"TELETHON: {match=}")
|
logger.debug(f"TELETHON: {match=}")
|
||||||
if not match: return False
|
if not match:
|
||||||
|
return False
|
||||||
|
|
||||||
is_private = match.group(1) == "/c"
|
is_private = match.group(1) == "/c"
|
||||||
chat = int(match.group(2)) if is_private else match.group(2)
|
chat = int(match.group(2)) if is_private else match.group(2)
|
||||||
@@ -105,39 +112,47 @@ class TelethonExtractor(Extractor):
|
|||||||
|
|
||||||
# NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token)
|
# NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token)
|
||||||
with self.client.start():
|
with self.client.start():
|
||||||
# with self.client.start(bot_token=self.bot_token):
|
# with self.client.start(bot_token=self.bot_token):
|
||||||
try:
|
try:
|
||||||
post = self.client.get_messages(chat, ids=post_id)
|
post = self.client.get_messages(chat, ids=post_id)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
||||||
return False
|
return False
|
||||||
except ChannelInvalidError as e:
|
except ChannelInvalidError as e:
|
||||||
logger.error(f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}")
|
logger.error(
|
||||||
|
f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}"
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.debug(f"TELETHON GOT POST {post=}")
|
logger.debug(f"TELETHON GOT POST {post=}")
|
||||||
if post is None: return False
|
if post is None:
|
||||||
|
return False
|
||||||
|
|
||||||
media_posts = self._get_media_posts_in_group(chat, post)
|
media_posts = self._get_media_posts_in_group(chat, post)
|
||||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
logger.debug(f"got {len(media_posts)=} for {url=}")
|
||||||
|
|
||||||
tmp_dir = self.tmp_dir
|
tmp_dir = self.tmp_dir
|
||||||
|
|
||||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||||
title = post.message
|
title = post.message
|
||||||
for mp in media_posts:
|
for mp in media_posts:
|
||||||
if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1)
|
if len(mp.message) > len(title):
|
||||||
|
title = mp.message # save the longest text found (usually only 1)
|
||||||
|
|
||||||
# media can also be in entities
|
# media can also be in entities
|
||||||
if mp.entities:
|
if mp.entities:
|
||||||
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
|
other_media_urls = [
|
||||||
|
e.url
|
||||||
|
for e in mp.entities
|
||||||
|
if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]
|
||||||
|
]
|
||||||
if len(other_media_urls):
|
if len(other_media_urls):
|
||||||
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
|
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
|
||||||
for i, om_url in enumerate(other_media_urls):
|
for i, om_url in enumerate(other_media_urls):
|
||||||
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
|
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
||||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||||
|
|
||||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
filename_dest = os.path.join(tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
||||||
filename = self.client.download_media(mp.media, filename_dest)
|
filename = self.client.download_media(mp.media, filename_dest)
|
||||||
if not filename:
|
if not filename:
|
||||||
logger.debug(f"Empty media found, skipping {str(mp)=}")
|
logger.debug(f"Empty media found, skipping {str(mp)=}")
|
||||||
|
|||||||
@@ -2,18 +2,19 @@
|
|||||||
"name": "Thumbnail Enricher",
|
"name": "Thumbnail Enricher",
|
||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"dependencies": {
|
"dependencies": {"python": ["loguru", "ffmpeg"], "bin": ["ffmpeg"]},
|
||||||
"python": ["loguru", "ffmpeg"],
|
|
||||||
"bin": ["ffmpeg"]
|
|
||||||
},
|
|
||||||
"configs": {
|
"configs": {
|
||||||
"thumbnails_per_minute": {"default": 60,
|
"thumbnails_per_minute": {
|
||||||
"type": "int",
|
"default": 60,
|
||||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
"type": "int",
|
||||||
"max_thumbnails": {"default": 16,
|
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails",
|
||||||
"type": "int",
|
|
||||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
|
||||||
},
|
},
|
||||||
|
"max_thumbnails": {
|
||||||
|
"default": 16,
|
||||||
|
"type": "int",
|
||||||
|
"help": "limit the number of thumbnails to generate per video, 0 means no limit",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Generates thumbnails for video files to provide visual previews.
|
Generates thumbnails for video files to provide visual previews.
|
||||||
|
|
||||||
@@ -27,5 +28,5 @@
|
|||||||
- Requires `ffmpeg` to be installed and accessible via the system's PATH.
|
- Requires `ffmpeg` to be installed and accessible via the system's PATH.
|
||||||
- Handles videos without pre-existing duration metadata by probing with `ffmpeg`.
|
- Handles videos without pre-existing duration metadata by probing with `ffmpeg`.
|
||||||
- Skips enrichment for non-video media files.
|
- Skips enrichment for non-video media files.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,9 @@ visual snapshots of the video's keyframes, helping users preview content
|
|||||||
and identify important moments without watching the entire video.
|
and identify important moments without watching the entire video.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import ffmpeg, os
|
|
||||||
|
import ffmpeg
|
||||||
|
import os
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
@@ -36,7 +38,9 @@ class ThumbnailEnricher(Enricher):
|
|||||||
if duration is None:
|
if duration is None:
|
||||||
try:
|
try:
|
||||||
probe = ffmpeg.probe(m.filename)
|
probe = ffmpeg.probe(m.filename)
|
||||||
duration = float(next(stream for stream in probe['streams'] if stream['codec_type'] == 'video')['duration'])
|
duration = float(
|
||||||
|
next(stream for stream in probe["streams"] if stream["codec_type"] == "video")["duration"]
|
||||||
|
)
|
||||||
to_enrich.media[m_id].set("duration", duration)
|
to_enrich.media[m_id].set("duration", duration)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||||
@@ -48,11 +52,13 @@ class ThumbnailEnricher(Enricher):
|
|||||||
thumbnails_media = []
|
thumbnails_media = []
|
||||||
for index, timestamp in enumerate(timestamps):
|
for index, timestamp in enumerate(timestamps):
|
||||||
output_path = os.path.join(folder, f"out{index}.jpg")
|
output_path = os.path.join(folder, f"out{index}.jpg")
|
||||||
ffmpeg.input(m.filename, ss=timestamp).filter('scale', 512, -1).output(output_path, vframes=1, loglevel="quiet").run()
|
ffmpeg.input(m.filename, ss=timestamp).filter("scale", 512, -1).output(
|
||||||
|
output_path, vframes=1, loglevel="quiet"
|
||||||
|
).run()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
thumbnails_media.append(Media(
|
thumbnails_media.append(
|
||||||
filename=output_path)
|
Media(filename=output_path)
|
||||||
.set("id", f"thumbnail_{index}")
|
.set("id", f"thumbnail_{index}")
|
||||||
.set("timestamp", "%.3fs" % timestamp)
|
.set("timestamp", "%.3fs" % timestamp)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,38 +3,29 @@
|
|||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": [
|
"python": ["loguru", "slugify", "tsp_client", "asn1crypto", "certvalidator", "certifi"],
|
||||||
"loguru",
|
|
||||||
"slugify",
|
|
||||||
"tsp_client",
|
|
||||||
"asn1crypto",
|
|
||||||
"certvalidator",
|
|
||||||
"certifi"
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"tsa_urls": {
|
"tsa_urls": {
|
||||||
"default": [
|
"default": [
|
||||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||||
"http://timestamp.digicert.com",
|
"http://timestamp.digicert.com",
|
||||||
"http://timestamp.identrust.com",
|
"http://timestamp.identrust.com",
|
||||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||||
|
# [Adobe: European Union Trusted Lists].
|
||||||
# [Adobe: European Union Trusted Lists].
|
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
# [Windows Cert Store]
|
||||||
|
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||||
# [Windows Cert Store]
|
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
"http://tss.accv.es:8318/tsa",
|
||||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
],
|
||||||
"http://tss.accv.es:8318/tsa",
|
|
||||||
],
|
|
||||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -50,5 +41,5 @@
|
|||||||
### Notes
|
### Notes
|
||||||
- Should be run after the `hash_enricher` to ensure file hashes are available.
|
- Should be run after the `hash_enricher` to ensure file hashes are available.
|
||||||
- Requires internet access to interact with the configured TSAs.
|
- Requires internet access to interact with the configured TSAs.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import certifi
|
|||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|
||||||
|
|
||||||
class TimestampingEnricher(Enricher):
|
class TimestampingEnricher(Enricher):
|
||||||
"""
|
"""
|
||||||
Uses several RFC3161 Time Stamp Authorities to generate a timestamp token that will be preserved. This can be used to prove that a certain file existed at a certain time, useful for legal purposes, for example, to prove that a certain file was not tampered with after a certain date.
|
Uses several RFC3161 Time Stamp Authorities to generate a timestamp token that will be preserved. This can be used to prove that a certain file existed at a certain time, useful for legal purposes, for example, to prove that a certain file was not tampered with after a certain date.
|
||||||
@@ -25,7 +26,9 @@ class TimestampingEnricher(Enricher):
|
|||||||
logger.debug(f"RFC3161 timestamping existing files for {url=}")
|
logger.debug(f"RFC3161 timestamping existing files for {url=}")
|
||||||
|
|
||||||
# create a new text file with the existing media hashes
|
# create a new text file with the existing media hashes
|
||||||
hashes = [m.get("hash").replace("SHA-256:", "").replace("SHA3-512:", "") for m in to_enrich.media if m.get("hash")]
|
hashes = [
|
||||||
|
m.get("hash").replace("SHA-256:", "").replace("SHA3-512:", "") for m in to_enrich.media if m.get("hash")
|
||||||
|
]
|
||||||
|
|
||||||
if not len(hashes):
|
if not len(hashes):
|
||||||
logger.warning(f"No hashes found in {url=}")
|
logger.warning(f"No hashes found in {url=}")
|
||||||
@@ -41,11 +44,12 @@ class TimestampingEnricher(Enricher):
|
|||||||
|
|
||||||
timestamp_tokens = []
|
timestamp_tokens = []
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
for tsa_url in self.tsa_urls:
|
for tsa_url in self.tsa_urls:
|
||||||
try:
|
try:
|
||||||
signing_settings = SigningSettings(tsp_server=tsa_url, digest_algorithm=DigestAlgorithm.SHA256)
|
signing_settings = SigningSettings(tsp_server=tsa_url, digest_algorithm=DigestAlgorithm.SHA256)
|
||||||
signer = TSPSigner()
|
signer = TSPSigner()
|
||||||
message = bytes(data_to_sign, encoding='utf8')
|
message = bytes(data_to_sign, encoding="utf8")
|
||||||
# send TSQ and get TSR from the TSA server
|
# send TSQ and get TSR from the TSA server
|
||||||
signed = signer.sign(message=message, signing_settings=signing_settings)
|
signed = signer.sign(message=message, signing_settings=signing_settings)
|
||||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs
|
# fail if there's any issue with the certificates, uses certifi list of trusted CAs
|
||||||
@@ -54,7 +58,8 @@ class TimestampingEnricher(Enricher):
|
|||||||
cert_chain = self.download_and_verify_certificate(signed)
|
cert_chain = self.download_and_verify_certificate(signed)
|
||||||
# continue with saving the timestamp token
|
# continue with saving the timestamp token
|
||||||
tst_fn = os.path.join(tmp_dir, f"timestamp_token_{slugify(tsa_url)}")
|
tst_fn = os.path.join(tmp_dir, f"timestamp_token_{slugify(tsa_url)}")
|
||||||
with open(tst_fn, "wb") as f: f.write(signed)
|
with open(tst_fn, "wb") as f:
|
||||||
|
f.write(signed)
|
||||||
timestamp_tokens.append(Media(filename=tst_fn).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
timestamp_tokens.append(Media(filename=tst_fn).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")
|
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")
|
||||||
@@ -75,7 +80,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
tst = ContentInfo.load(signed)
|
tst = ContentInfo.load(signed)
|
||||||
|
|
||||||
trust_roots = []
|
trust_roots = []
|
||||||
with open(certifi.where(), 'rb') as f:
|
with open(certifi.where(), "rb") as f:
|
||||||
for _, _, der_bytes in pem.unarmor(f.read(), multiple=True):
|
for _, _, der_bytes in pem.unarmor(f.read(), multiple=True):
|
||||||
trust_roots.append(der_bytes)
|
trust_roots.append(der_bytes)
|
||||||
context = ValidationContext(trust_roots=trust_roots)
|
context = ValidationContext(trust_roots=trust_roots)
|
||||||
@@ -83,11 +88,11 @@ class TimestampingEnricher(Enricher):
|
|||||||
certificates = tst["content"]["certificates"]
|
certificates = tst["content"]["certificates"]
|
||||||
first_cert = certificates[0].dump()
|
first_cert = certificates[0].dump()
|
||||||
intermediate_certs = []
|
intermediate_certs = []
|
||||||
for i in range(1, len(certificates)): # cannot use list comprehension [1:]
|
for i in range(1, len(certificates)): # cannot use list comprehension [1:]
|
||||||
intermediate_certs.append(certificates[i].dump())
|
intermediate_certs.append(certificates[i].dump())
|
||||||
|
|
||||||
validator = CertificateValidator(first_cert, intermediate_certs=intermediate_certs, validation_context=context)
|
validator = CertificateValidator(first_cert, intermediate_certs=intermediate_certs, validation_context=context)
|
||||||
path = validator.validate_usage({'digital_signature'}, extended_key_usage={'time_stamping'})
|
path = validator.validate_usage({"digital_signature"}, extended_key_usage={"time_stamping"})
|
||||||
|
|
||||||
cert_chain = []
|
cert_chain = []
|
||||||
for cert in path:
|
for cert in path:
|
||||||
|
|||||||
@@ -3,21 +3,28 @@
|
|||||||
"type": ["extractor"],
|
"type": ["extractor"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": ["requests",
|
"python": [
|
||||||
"loguru",
|
"requests",
|
||||||
"pytwitter",
|
"loguru",
|
||||||
"slugify",],
|
"pytwitter",
|
||||||
"bin": [""]
|
"slugify",
|
||||||
|
],
|
||||||
|
"bin": [""],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
"bearer_token": {
|
||||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
"default": None,
|
||||||
},
|
"help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret",
|
||||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
|
||||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
|
||||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
|
||||||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
|
||||||
},
|
},
|
||||||
|
"bearer_tokens": {
|
||||||
|
"default": [],
|
||||||
|
"help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||||
|
},
|
||||||
|
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||||
|
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||||
|
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||||
|
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
|
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
|
||||||
It supports multiple API configurations for extended rate limits and reliable access.
|
It supports multiple API configurations for extended rate limits and reliable access.
|
||||||
@@ -39,6 +46,5 @@
|
|||||||
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
|
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
|
||||||
|
|
||||||
Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
|
Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
|
||||||
"""
|
""",
|
||||||
,
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ from slugify import slugify
|
|||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|
||||||
class TwitterApiExtractor(Extractor):
|
|
||||||
|
|
||||||
|
class TwitterApiExtractor(Extractor):
|
||||||
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
@@ -23,9 +23,17 @@ class TwitterApiExtractor(Extractor):
|
|||||||
if self.bearer_token:
|
if self.bearer_token:
|
||||||
self.apis.append(Api(bearer_token=self.bearer_token))
|
self.apis.append(Api(bearer_token=self.bearer_token))
|
||||||
if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
|
if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
|
||||||
self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
|
self.apis.append(
|
||||||
access_token=self.access_token, access_secret=self.access_secret))
|
Api(
|
||||||
assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
consumer_key=self.consumer_key,
|
||||||
|
consumer_secret=self.consumer_secret,
|
||||||
|
access_token=self.access_token,
|
||||||
|
access_secret=self.access_secret,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert self.api_client is not None, (
|
||||||
|
"Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
||||||
|
)
|
||||||
|
|
||||||
@property # getter .mimetype
|
@property # getter .mimetype
|
||||||
def api_client(self) -> str:
|
def api_client(self) -> str:
|
||||||
@@ -33,20 +41,20 @@ class TwitterApiExtractor(Extractor):
|
|||||||
|
|
||||||
def sanitize_url(self, url: str) -> str:
|
def sanitize_url(self, url: str) -> str:
|
||||||
# expand URL if t.co and clean tracker GET params
|
# expand URL if t.co and clean tracker GET params
|
||||||
if 'https://t.co/' in url:
|
if "https://t.co/" in url:
|
||||||
try:
|
try:
|
||||||
r = requests.get(url, timeout=30)
|
r = requests.get(url, timeout=30)
|
||||||
logger.debug(f'Expanded url {url} to {r.url}')
|
logger.debug(f"Expanded url {url} to {r.url}")
|
||||||
url = r.url
|
url = r.url
|
||||||
except:
|
except Exception:
|
||||||
logger.error(f'Failed to expand url {url}')
|
logger.error(f"Failed to expand url {url}")
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
# call download retry until success or no more apis
|
# call download retry until success or no more apis
|
||||||
while self.api_index < len(self.apis):
|
while self.api_index < len(self.apis):
|
||||||
if res := self.download_retry(item): return res
|
if res := self.download_retry(item):
|
||||||
|
return res
|
||||||
self.api_index += 1
|
self.api_index += 1
|
||||||
self.api_index = 0
|
self.api_index = 0
|
||||||
return False
|
return False
|
||||||
@@ -54,7 +62,8 @@ class TwitterApiExtractor(Extractor):
|
|||||||
def get_username_tweet_id(self, url):
|
def get_username_tweet_id(self, url):
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
matches = self.valid_url.findall(url)
|
matches = self.valid_url.findall(url)
|
||||||
if not len(matches): return False, False
|
if not len(matches):
|
||||||
|
return False, False
|
||||||
|
|
||||||
username, tweet_id = matches[0] # only one URL supported
|
username, tweet_id = matches[0] # only one URL supported
|
||||||
logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
|
logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
|
||||||
@@ -65,10 +74,16 @@ class TwitterApiExtractor(Extractor):
|
|||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
username, tweet_id = self.get_username_tweet_id(url)
|
username, tweet_id = self.get_username_tweet_id(url)
|
||||||
if not username: return False
|
if not username:
|
||||||
|
return False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
|
tweet = self.api_client.get_tweet(
|
||||||
|
tweet_id,
|
||||||
|
expansions=["attachments.media_keys"],
|
||||||
|
media_fields=["type", "duration_ms", "url", "variants"],
|
||||||
|
tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
|
||||||
|
)
|
||||||
logger.debug(tweet)
|
logger.debug(tweet)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Could not get tweet: {e}")
|
logger.error(f"Could not get tweet: {e}")
|
||||||
@@ -88,29 +103,35 @@ class TwitterApiExtractor(Extractor):
|
|||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
elif hasattr(m, "variants"):
|
elif hasattr(m, "variants"):
|
||||||
variant = self.choose_variant(m.variants)
|
variant = self.choose_variant(m.variants)
|
||||||
if not variant: continue
|
if not variant:
|
||||||
|
continue
|
||||||
media.set("src", variant.url)
|
media.set("src", variant.url)
|
||||||
mimetype = variant.content_type
|
mimetype = variant.content_type
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
logger.info(f"Found media {media}")
|
logger.info(f"Found media {media}")
|
||||||
ext = mimetypes.guess_extension(mimetype)
|
ext = mimetypes.guess_extension(mimetype)
|
||||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
|
|
||||||
result.set_content(json.dumps({
|
result.set_content(
|
||||||
"id": tweet.data.id,
|
json.dumps(
|
||||||
"text": tweet.data.text,
|
{
|
||||||
"created_at": tweet.data.created_at,
|
"id": tweet.data.id,
|
||||||
"author_id": tweet.data.author_id,
|
"text": tweet.data.text,
|
||||||
"geo": tweet.data.geo,
|
"created_at": tweet.data.created_at,
|
||||||
"lang": tweet.data.lang,
|
"author_id": tweet.data.author_id,
|
||||||
"media": urls
|
"geo": tweet.data.geo,
|
||||||
}, ensure_ascii=False, indent=4))
|
"lang": tweet.data.lang,
|
||||||
|
"media": urls,
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
indent=4,
|
||||||
|
)
|
||||||
|
)
|
||||||
return result.success("twitter-api")
|
return result.success("twitter-api")
|
||||||
|
|
||||||
def choose_variant(self, variants):
|
def choose_variant(self, variants):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Chooses the highest quality variable possible out of a list of variants
|
Chooses the highest quality variable possible out of a list of variants
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -7,10 +7,8 @@
|
|||||||
"python": ["loguru", "vk_url_scraper"],
|
"python": ["loguru", "vk_url_scraper"],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"username": {"required": True,
|
"username": {"required": True, "help": "valid VKontakte username"},
|
||||||
"help": "valid VKontakte username"},
|
"password": {"required": True, "help": "valid VKontakte password"},
|
||||||
"password": {"required": True,
|
|
||||||
"help": "valid VKontakte password"},
|
|
||||||
"session_file": {
|
"session_file": {
|
||||||
"default": "secrets/vk_config.v2.json",
|
"default": "secrets/vk_config.v2.json",
|
||||||
"help": "valid VKontakte password",
|
"help": "valid VKontakte password",
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from auto_archiver.core import Metadata, Media
|
|||||||
|
|
||||||
|
|
||||||
class VkExtractor(Extractor):
|
class VkExtractor(Extractor):
|
||||||
""""
|
""" "
|
||||||
VK videos are handled by YTDownloader, this archiver gets posts text and images.
|
VK videos are handled by YTDownloader, this archiver gets posts text and images.
|
||||||
Currently only works for /wall posts
|
Currently only works for /wall posts
|
||||||
"""
|
"""
|
||||||
@@ -18,11 +18,13 @@ class VkExtractor(Extractor):
|
|||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
if "vk.com" not in item.netloc: return False
|
if "vk.com" not in item.netloc:
|
||||||
|
return False
|
||||||
|
|
||||||
# some urls can contain multiple wall/photo/... parts and all will be fetched
|
# some urls can contain multiple wall/photo/... parts and all will be fetched
|
||||||
vk_scrapes = self.vks.scrape(url)
|
vk_scrapes = self.vks.scrape(url)
|
||||||
if not len(vk_scrapes): return False
|
if not len(vk_scrapes):
|
||||||
|
return False
|
||||||
logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
|
logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
|
|||||||
@@ -4,34 +4,38 @@
|
|||||||
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
|
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"python": [
|
"python": ["loguru", "jsonlines", "warcio"],
|
||||||
"loguru",
|
|
||||||
"jsonlines",
|
|
||||||
"warcio"
|
|
||||||
],
|
|
||||||
# TODO?
|
# TODO?
|
||||||
"bin": [
|
"bin": ["docker"],
|
||||||
"docker"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
"profile": {
|
||||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
"default": None,
|
||||||
"timeout": {"default": 120,
|
"help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
|
||||||
"type": "int",
|
|
||||||
"help": "timeout for WACZ generation in seconds", "type": "int"},
|
|
||||||
"extract_media": {"default": False,
|
|
||||||
"type": 'bool',
|
|
||||||
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
|
||||||
},
|
|
||||||
"extract_screenshot": {"default": True,
|
|
||||||
"type": 'bool',
|
|
||||||
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
|
||||||
},
|
|
||||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
|
||||||
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
|
||||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
|
||||||
},
|
},
|
||||||
|
"docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
|
||||||
|
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||||
|
"extract_media": {
|
||||||
|
"default": False,
|
||||||
|
"type": "bool",
|
||||||
|
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.",
|
||||||
|
},
|
||||||
|
"extract_screenshot": {
|
||||||
|
"default": True,
|
||||||
|
"type": "bool",
|
||||||
|
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.",
|
||||||
|
},
|
||||||
|
"socks_proxy_host": {
|
||||||
|
"default": None,
|
||||||
|
"help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host",
|
||||||
|
},
|
||||||
|
"socks_proxy_port": {
|
||||||
|
"default": None,
|
||||||
|
"type": "int",
|
||||||
|
"help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234",
|
||||||
|
},
|
||||||
|
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||||
@@ -45,5 +49,5 @@
|
|||||||
### Notes
|
### Notes
|
||||||
- Requires Docker for running `browsertrix-crawler` .
|
- Requires Docker for running `browsertrix-crawler` .
|
||||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import jsonlines
|
import jsonlines
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os, shutil, subprocess
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from warcio.archiveiterator import ArchiveIterator
|
from warcio.archiveiterator import ArchiveIterator
|
||||||
@@ -19,13 +21,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||||
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||||
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
|
||||||
|
|
||||||
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
||||||
self.browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST')
|
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||||
self.browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or self.browsertrix_home_host
|
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
os.makedirs(self.cwd_dind, exist_ok=True)
|
os.makedirs(self.cwd_dind, exist_ok=True)
|
||||||
@@ -55,18 +56,29 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"crawl",
|
"crawl",
|
||||||
"--url", url,
|
"--url",
|
||||||
"--scopeType", "page",
|
url,
|
||||||
|
"--scopeType",
|
||||||
|
"page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text", "to-pages",
|
"--text",
|
||||||
"--screenshot", "fullPage",
|
"to-pages",
|
||||||
"--collection", collection,
|
"--screenshot",
|
||||||
"--id", collection,
|
"fullPage",
|
||||||
"--saveState", "never",
|
"--collection",
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
collection,
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--id",
|
||||||
"--timeout", str(self.timeout),
|
collection,
|
||||||
"--diskUtilization", "99",
|
"--saveState",
|
||||||
|
"never",
|
||||||
|
"--behaviors",
|
||||||
|
"autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
|
"--behaviorTimeout",
|
||||||
|
str(self.timeout),
|
||||||
|
"--timeout",
|
||||||
|
str(self.timeout),
|
||||||
|
"--diskUtilization",
|
||||||
|
"99",
|
||||||
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -80,7 +92,14 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
if self.docker_commands:
|
if self.docker_commands:
|
||||||
cmd = self.docker_commands + cmd
|
cmd = self.docker_commands + cmd
|
||||||
else:
|
else:
|
||||||
cmd = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd
|
cmd = [
|
||||||
|
"docker",
|
||||||
|
"run",
|
||||||
|
"--rm",
|
||||||
|
"-v",
|
||||||
|
f"{browsertrix_home_host}:/crawls/",
|
||||||
|
"webrecorder/browsertrix-crawler",
|
||||||
|
] + cmd
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
|
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
|
||||||
@@ -109,7 +128,6 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
logger.error(f"WACZ generation failed: {e}")
|
logger.error(f"WACZ generation failed: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
|
||||||
elif self.use_docker:
|
elif self.use_docker:
|
||||||
@@ -138,11 +156,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
logger.info(f"Parsing pages.jsonl {jsonl_fn=}")
|
logger.info(f"Parsing pages.jsonl {jsonl_fn=}")
|
||||||
with jsonlines.open(jsonl_fn) as reader:
|
with jsonlines.open(jsonl_fn) as reader:
|
||||||
for obj in reader:
|
for obj in reader:
|
||||||
if 'title' in obj:
|
if "title" in obj:
|
||||||
to_enrich.set_title(obj['title'])
|
to_enrich.set_title(obj["title"])
|
||||||
if 'text' in obj:
|
if "text" in obj:
|
||||||
to_enrich.set_content(obj['text'])
|
to_enrich.set_content(obj["text"])
|
||||||
|
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -155,36 +172,41 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
# unzipping the .wacz
|
# unzipping the .wacz
|
||||||
tmp_dir = self.tmp_dir
|
tmp_dir = self.tmp_dir
|
||||||
unzipped_dir = os.path.join(tmp_dir, "unzipped")
|
unzipped_dir = os.path.join(tmp_dir, "unzipped")
|
||||||
with ZipFile(wacz_filename, 'r') as z_obj:
|
with ZipFile(wacz_filename, "r") as z_obj:
|
||||||
z_obj.extractall(path=unzipped_dir)
|
z_obj.extractall(path=unzipped_dir)
|
||||||
|
|
||||||
# if warc is split into multiple gzip chunks, merge those
|
# if warc is split into multiple gzip chunks, merge those
|
||||||
warc_dir = os.path.join(unzipped_dir, "archive")
|
warc_dir = os.path.join(unzipped_dir, "archive")
|
||||||
warc_filename = os.path.join(tmp_dir, "merged.warc")
|
warc_filename = os.path.join(tmp_dir, "merged.warc")
|
||||||
with open(warc_filename, 'wb') as outfile:
|
with open(warc_filename, "wb") as outfile:
|
||||||
for filename in sorted(os.listdir(warc_dir)):
|
for filename in sorted(os.listdir(warc_dir)):
|
||||||
if filename.endswith('.gz'):
|
if filename.endswith(".gz"):
|
||||||
chunk_file = os.path.join(warc_dir, filename)
|
chunk_file = os.path.join(warc_dir, filename)
|
||||||
with open(chunk_file, 'rb') as infile:
|
with open(chunk_file, "rb") as infile:
|
||||||
shutil.copyfileobj(infile, outfile)
|
shutil.copyfileobj(infile, outfile)
|
||||||
|
|
||||||
# get media out of .warc
|
# get media out of .warc
|
||||||
counter = 0
|
counter = 0
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
import json
|
|
||||||
with open(warc_filename, 'rb') as warc_stream:
|
with open(warc_filename, "rb") as warc_stream:
|
||||||
for record in ArchiveIterator(warc_stream):
|
for record in ArchiveIterator(warc_stream):
|
||||||
# only include fetched resources
|
# only include fetched resources
|
||||||
if record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot: # screenshots
|
if (
|
||||||
|
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
|
||||||
|
): # screenshots
|
||||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
||||||
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
with open(fn, "wb") as outf:
|
||||||
|
outf.write(record.raw_stream.read())
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
to_enrich.add_media(m, "browsertrix-screenshot")
|
to_enrich.add_media(m, "browsertrix-screenshot")
|
||||||
counter += 1
|
counter += 1
|
||||||
if not self.extract_media: continue
|
if not self.extract_media:
|
||||||
|
continue
|
||||||
|
|
||||||
if record.rec_type != 'response': continue
|
if record.rec_type != "response":
|
||||||
record_url = record.rec_headers.get_header('WARC-Target-URI')
|
continue
|
||||||
|
record_url = record.rec_headers.get_header("WARC-Target-URI")
|
||||||
if not UrlUtil.is_relevant_url(record_url):
|
if not UrlUtil.is_relevant_url(record_url):
|
||||||
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
||||||
continue
|
continue
|
||||||
@@ -194,8 +216,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
# filter by media mimetypes
|
# filter by media mimetypes
|
||||||
content_type = record.http_headers.get("Content-Type")
|
content_type = record.http_headers.get("Content-Type")
|
||||||
if not content_type: continue
|
if not content_type:
|
||||||
if not any(x in content_type for x in ["video", "image", "audio"]): continue
|
continue
|
||||||
|
if not any(x in content_type for x in ["video", "image", "audio"]):
|
||||||
|
continue
|
||||||
|
|
||||||
# create local file and add media
|
# create local file and add media
|
||||||
ext = mimetypes.guess_extension(content_type)
|
ext = mimetypes.guess_extension(content_type)
|
||||||
@@ -203,7 +227,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
fn = os.path.join(tmp_dir, warc_fn)
|
fn = os.path.join(tmp_dir, warc_fn)
|
||||||
|
|
||||||
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
||||||
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
with open(fn, "wb") as outf:
|
||||||
|
outf.write(record.raw_stream.read())
|
||||||
|
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
m.set("src", record_url)
|
m.set("src", record_url)
|
||||||
@@ -213,10 +238,14 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
m.filename = self.download_from_url(record_url_best_qual, warc_fn)
|
m.filename = self.download_from_url(record_url_best_qual, warc_fn)
|
||||||
m.set("src", record_url_best_qual)
|
m.set("src", record_url_best_qual)
|
||||||
m.set("src_alternative", record_url)
|
m.set("src_alternative", record_url)
|
||||||
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC."
|
||||||
|
)
|
||||||
|
|
||||||
# remove bad videos
|
# remove bad videos
|
||||||
if m.is_video() and not m.is_valid_video(): continue
|
if m.is_video() and not m.is_valid_video():
|
||||||
|
continue
|
||||||
|
|
||||||
to_enrich.add_media(m, warc_fn)
|
to_enrich.add_media(m, warc_fn)
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
import json
|
import json
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import time, requests
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
from auto_archiver.core import Extractor, Enricher
|
from auto_archiver.core import Extractor, Enricher
|
||||||
from auto_archiver.utils import url as UrlUtil
|
from auto_archiver.utils import url as UrlUtil
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
|
||||||
|
|
||||||
class WaybackExtractorEnricher(Enricher, Extractor):
|
class WaybackExtractorEnricher(Enricher, Extractor):
|
||||||
"""
|
"""
|
||||||
Submits the current URL to the webarchive and returns a job_id or completed archive.
|
Submits the current URL to the webarchive and returns a job_id or completed archive.
|
||||||
@@ -22,8 +24,10 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> bool:
|
def enrich(self, to_enrich: Metadata) -> bool:
|
||||||
proxies = {}
|
proxies = {}
|
||||||
if self.proxy_http: proxies["http"] = self.proxy_http
|
if self.proxy_http:
|
||||||
if self.proxy_https: proxies["https"] = self.proxy_https
|
proxies["http"] = self.proxy_http
|
||||||
|
if self.proxy_https:
|
||||||
|
proxies["https"] = self.proxy_https
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
if UrlUtil.is_auth_wall(url):
|
if UrlUtil.is_auth_wall(url):
|
||||||
@@ -36,15 +40,12 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
|
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
ia_headers = {
|
ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"}
|
||||||
"Accept": "application/json",
|
post_data = {"url": url}
|
||||||
"Authorization": f"LOW {self.key}:{self.secret}"
|
|
||||||
}
|
|
||||||
post_data = {'url': url}
|
|
||||||
if self.if_not_archived_within:
|
if self.if_not_archived_within:
|
||||||
post_data["if_not_archived_within"] = self.if_not_archived_within
|
post_data["if_not_archived_within"] = self.if_not_archived_within
|
||||||
# see https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA for more options
|
# see https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA for more options
|
||||||
r = requests.post('https://web.archive.org/save/', headers=ia_headers, data=post_data, proxies=proxies)
|
r = requests.post("https://web.archive.org/save/", headers=ia_headers, data=post_data, proxies=proxies)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
|
logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
|
||||||
@@ -53,15 +54,14 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
# check job status
|
# check job status
|
||||||
try:
|
try:
|
||||||
job_id = r.json().get('job_id')
|
job_id = r.json().get("job_id")
|
||||||
if not job_id:
|
if not job_id:
|
||||||
logger.error(f"Wayback failed with {r.json()}")
|
logger.error(f"Wayback failed with {r.json()}")
|
||||||
return False
|
return False
|
||||||
except json.decoder.JSONDecodeError as e:
|
except json.decoder.JSONDecodeError:
|
||||||
logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}")
|
logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
# waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
|
# waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
wayback_url = False
|
wayback_url = False
|
||||||
@@ -69,17 +69,19 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
while not wayback_url and time.time() - start_time <= self.timeout:
|
while not wayback_url and time.time() - start_time <= self.timeout:
|
||||||
try:
|
try:
|
||||||
logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
|
logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
|
||||||
r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers, proxies=proxies)
|
r_status = requests.get(
|
||||||
|
f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies
|
||||||
|
)
|
||||||
r_json = r_status.json()
|
r_json = r_status.json()
|
||||||
if r_status.status_code == 200 and r_json['status'] == 'success':
|
if r_status.status_code == 200 and r_json["status"] == "success":
|
||||||
wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
|
wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
|
||||||
elif r_status.status_code != 200 or r_json['status'] != 'pending':
|
elif r_status.status_code != 200 or r_json["status"] != "pending":
|
||||||
logger.error(f"Wayback failed with {r_json}")
|
logger.error(f"Wayback failed with {r_json}")
|
||||||
return False
|
return False
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
|
logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
|
||||||
break
|
break
|
||||||
except json.decoder.JSONDecodeError as e:
|
except json.decoder.JSONDecodeError:
|
||||||
logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
|
logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -91,6 +93,8 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
if wayback_url:
|
if wayback_url:
|
||||||
to_enrich.set("wayback", wayback_url)
|
to_enrich.set("wayback", wayback_url)
|
||||||
else:
|
else:
|
||||||
to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
|
to_enrich.set(
|
||||||
|
"wayback", {"job_id": job_id, "check_status": f"https://web.archive.org/save/status/{job_id}"}
|
||||||
|
)
|
||||||
to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}")
|
to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}")
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -6,19 +6,26 @@
|
|||||||
"python": ["s3_storage", "loguru", "requests"],
|
"python": ["s3_storage", "loguru", "requests"],
|
||||||
},
|
},
|
||||||
"configs": {
|
"configs": {
|
||||||
"api_endpoint": {"required": True,
|
"api_endpoint": {
|
||||||
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
"required": True,
|
||||||
"api_key": {"required": True,
|
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe.",
|
||||||
"help": "WhisperApi api key for authentication"},
|
},
|
||||||
"include_srt": {"default": False,
|
"api_key": {"required": True, "help": "WhisperApi api key for authentication"},
|
||||||
"type": "bool",
|
"include_srt": {
|
||||||
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
"default": False,
|
||||||
"timeout": {"default": 90,
|
"type": "bool",
|
||||||
"type": "int",
|
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).",
|
||||||
"help": "How many seconds to wait at most for a successful job completion."},
|
},
|
||||||
"action": {"default": "translate",
|
"timeout": {
|
||||||
"help": "which Whisper operation to execute",
|
"default": 90,
|
||||||
"choices": ["transcribe", "translate", "language_detection"]},
|
"type": "int",
|
||||||
|
"help": "How many seconds to wait at most for a successful job completion.",
|
||||||
|
},
|
||||||
|
"action": {
|
||||||
|
"default": "translate",
|
||||||
|
"help": "which Whisper operation to execute",
|
||||||
|
"choices": ["transcribe", "translate", "language_detection"],
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
||||||
@@ -35,5 +42,5 @@
|
|||||||
- Only compatible with S3-compatible storage systems for media file accessibility.
|
- Only compatible with S3-compatible storage systems for media file accessibility.
|
||||||
- ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files.
|
- ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files.
|
||||||
- Handles multiple jobs and retries for failed or incomplete processing.
|
- Handles multiple jobs and retries for failed or incomplete processing.
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
import traceback
|
import traceback
|
||||||
import requests, time
|
import requests
|
||||||
|
import time
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|
||||||
|
|
||||||
class WhisperEnricher(Enricher):
|
class WhisperEnricher(Enricher):
|
||||||
"""
|
"""
|
||||||
Connects with a Whisper API service to get texts out of audio
|
Connects with a Whisper API service to get texts out of audio
|
||||||
@@ -13,15 +15,15 @@ class WhisperEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.stores = self.config['steps']['storages']
|
self.stores = self.config["steps"]["storages"]
|
||||||
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
||||||
if not "s3_storage" in self.stores:
|
if "s3_storage" not in self.stores:
|
||||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
logger.error(
|
||||||
|
"WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
||||||
|
|
||||||
@@ -36,28 +38,33 @@ class WhisperEnricher(Enricher):
|
|||||||
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
|
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
|
||||||
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
|
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")
|
logger.error(
|
||||||
|
f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}"
|
||||||
|
)
|
||||||
|
|
||||||
job_results = self.check_jobs(job_results)
|
job_results = self.check_jobs(job_results)
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if m.is_video() or m.is_audio():
|
if m.is_video() or m.is_audio():
|
||||||
job_id = to_enrich.media[i].get("whisper_model", {}).get("job_id")
|
job_id = to_enrich.media[i].get("whisper_model", {}).get("job_id")
|
||||||
if not job_id: continue
|
if not job_id:
|
||||||
to_enrich.media[i].set("whisper_model", {
|
continue
|
||||||
"job_id": job_id,
|
to_enrich.media[i].set(
|
||||||
"job_status_check": f"{self.api_endpoint}/jobs/{job_id}",
|
"whisper_model",
|
||||||
"job_artifacts_check": f"{self.api_endpoint}/jobs/{job_id}/artifacts",
|
{
|
||||||
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
|
"job_id": job_id,
|
||||||
})
|
"job_status_check": f"{self.api_endpoint}/jobs/{job_id}",
|
||||||
|
"job_artifacts_check": f"{self.api_endpoint}/jobs/{job_id}/artifacts",
|
||||||
|
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"}),
|
||||||
|
},
|
||||||
|
)
|
||||||
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
|
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
|
||||||
if job_results[job_id]:
|
if job_results[job_id]:
|
||||||
for k,v in job_results[job_id].items():
|
for k, v in job_results[job_id].items():
|
||||||
if "_text" in k and len(v):
|
if "_text" in k and len(v):
|
||||||
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
||||||
|
|
||||||
def submit_job(self, media: Media):
|
def submit_job(self, media: Media):
|
||||||
|
|
||||||
s3_url = self.s3.get_cdn_url(media)
|
s3_url = self.s3.get_cdn_url(media)
|
||||||
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
|
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
|
||||||
payload = {
|
payload = {
|
||||||
@@ -66,10 +73,14 @@ class WhisperEnricher(Enricher):
|
|||||||
# "language": "string" # may be a config
|
# "language": "string" # may be a config
|
||||||
}
|
}
|
||||||
logger.debug(f"calling API with {payload=}")
|
logger.debug(f"calling API with {payload=}")
|
||||||
response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
|
response = requests.post(
|
||||||
assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
|
f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"}
|
||||||
|
)
|
||||||
|
assert response.status_code == 201, (
|
||||||
|
f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
|
||||||
|
)
|
||||||
logger.debug(response.json())
|
logger.debug(response.json())
|
||||||
return response.json()['id']
|
return response.json()["id"]
|
||||||
|
|
||||||
def check_jobs(self, job_results: dict):
|
def check_jobs(self, job_results: dict):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
@@ -77,37 +88,50 @@ class WhisperEnricher(Enricher):
|
|||||||
while not all_completed and (time.time() - start_time) <= self.timeout:
|
while not all_completed and (time.time() - start_time) <= self.timeout:
|
||||||
all_completed = True
|
all_completed = True
|
||||||
for job_id in job_results:
|
for job_id in job_results:
|
||||||
if job_results[job_id] != False: continue
|
if job_results[job_id] is not False:
|
||||||
|
continue
|
||||||
all_completed = False # at least one not ready
|
all_completed = False # at least one not ready
|
||||||
try: job_results[job_id] = self.check_job(job_id)
|
try:
|
||||||
|
job_results[job_id] = self.check_job(job_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
|
logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
|
||||||
if not all_completed: time.sleep(3)
|
if not all_completed:
|
||||||
|
time.sleep(3)
|
||||||
return job_results
|
return job_results
|
||||||
|
|
||||||
def check_job(self, job_id):
|
def check_job(self, job_id):
|
||||||
r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
|
r = requests.get(f"{self.api_endpoint}/jobs/{job_id}", headers={"Authorization": f"Bearer {self.api_key}"})
|
||||||
assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
|
assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
|
||||||
j = r.json()
|
j = r.json()
|
||||||
logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
|
logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
|
||||||
if j['status'] == "processing": return False
|
if j["status"] == "processing":
|
||||||
elif j['status'] == "error": return f"Error: {j['meta']['error']}"
|
return False
|
||||||
elif j['status'] == "success":
|
elif j["status"] == "error":
|
||||||
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
|
return f"Error: {j['meta']['error']}"
|
||||||
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
elif j["status"] == "success":
|
||||||
|
r_res = requests.get(
|
||||||
|
f"{self.api_endpoint}/jobs/{job_id}/artifacts", headers={"Authorization": f"Bearer {self.api_key}"}
|
||||||
|
)
|
||||||
|
assert r_res.status_code == 200, (
|
||||||
|
f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
||||||
|
)
|
||||||
logger.success(r_res.json())
|
logger.success(r_res.json())
|
||||||
result = {}
|
result = {}
|
||||||
for art_id, artifact in enumerate(r_res.json()):
|
for art_id, artifact in enumerate(r_res.json()):
|
||||||
subtitle = []
|
subtitle = []
|
||||||
full_text = []
|
full_text = []
|
||||||
for i, d in enumerate(artifact.get("data")):
|
for i, d in enumerate(artifact.get("data")):
|
||||||
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
subtitle.append(f"{i + 1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
||||||
full_text.append(d.get('text').strip())
|
full_text.append(d.get("text").strip())
|
||||||
if not len(subtitle): continue
|
if not len(subtitle):
|
||||||
if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
continue
|
||||||
|
if self.include_srt:
|
||||||
|
result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
||||||
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
||||||
# call /delete endpoint on timely success
|
# call /delete endpoint on timely success
|
||||||
r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
|
r_del = requests.delete(
|
||||||
|
f"{self.api_endpoint}/jobs/{job_id}", headers={"Authorization": f"Bearer {self.api_key}"}
|
||||||
|
)
|
||||||
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
||||||
return result
|
return result
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
""" Auto Archiver Utilities. """
|
"""Auto Archiver Utilities."""
|
||||||
|
|
||||||
# we need to explicitly expose the available imports here
|
# we need to explicitly expose the available imports here
|
||||||
from .misc import *
|
from .misc import *
|
||||||
from .webdriver import Webdriver
|
from .webdriver import Webdriver
|
||||||
|
|
||||||
# handy utils from ytdlp
|
# handy utils from ytdlp
|
||||||
from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)
|
from yt_dlp.utils import clean_html, traverse_obj, strip_or_none, url_or_none
|
||||||
|
|||||||
@@ -16,22 +16,23 @@ def mkdir_if_not_exists(folder):
|
|||||||
|
|
||||||
def expand_url(url):
|
def expand_url(url):
|
||||||
# expand short URL links
|
# expand short URL links
|
||||||
if 'https://t.co/' in url:
|
if "https://t.co/" in url:
|
||||||
try:
|
try:
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
logger.debug(f'Expanded url {url} to {r.url}')
|
logger.debug(f"Expanded url {url} to {r.url}")
|
||||||
return r.url
|
return r.url
|
||||||
except:
|
except Exception:
|
||||||
logger.error(f'Failed to expand url {url}')
|
logger.error(f"Failed to expand url {url}")
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def getattr_or(o: object, prop: str, default=None):
|
def getattr_or(o: object, prop: str, default=None):
|
||||||
try:
|
try:
|
||||||
res = getattr(o, prop)
|
res = getattr(o, prop)
|
||||||
if res is None: raise
|
if res is None:
|
||||||
|
raise
|
||||||
return res
|
return res
|
||||||
except:
|
except Exception:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
@@ -61,18 +62,19 @@ def random_str(length: int = 32) -> str:
|
|||||||
return str(uuid.uuid4()).replace("-", "")[:length]
|
return str(uuid.uuid4()).replace("-", "")[:length]
|
||||||
|
|
||||||
|
|
||||||
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
|
def calculate_file_hash(filename: str, hash_algo=hashlib.sha256, chunksize: int = 16000000) -> str:
|
||||||
hash = hash_algo()
|
hash = hash_algo()
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
while True:
|
while True:
|
||||||
buf = f.read(chunksize)
|
buf = f.read(chunksize)
|
||||||
if not buf: break
|
if not buf:
|
||||||
|
break
|
||||||
hash.update(buf)
|
hash.update(buf)
|
||||||
return hash.hexdigest()
|
return hash.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def get_datetime_from_str(dt_str: str, fmt: str | None = None, dayfirst=True) -> datetime | None:
|
def get_datetime_from_str(dt_str: str, fmt: str | None = None, dayfirst=True) -> datetime | None:
|
||||||
""" parse a datetime string with option of passing a specific format
|
"""parse a datetime string with option of passing a specific format
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dt_str: the datetime string to parse
|
dt_str: the datetime string to parse
|
||||||
@@ -88,19 +90,24 @@ def get_datetime_from_str(dt_str: str, fmt: str | None = None, dayfirst=True) ->
|
|||||||
|
|
||||||
|
|
||||||
def get_timestamp(ts, utc=True, iso=True, dayfirst=True) -> str | datetime | None:
|
def get_timestamp(ts, utc=True, iso=True, dayfirst=True) -> str | datetime | None:
|
||||||
""" Consistent parsing of timestamps.
|
"""Consistent parsing of timestamps.
|
||||||
Args:
|
Args:
|
||||||
If utc=True, the timezone is set to UTC,
|
If utc=True, the timezone is set to UTC,
|
||||||
if iso=True, the output is an iso string
|
if iso=True, the output is an iso string
|
||||||
Use dayfirst to signify between date formats which put the date vs month first:
|
Use dayfirst to signify between date formats which put the date vs month first:
|
||||||
e.g. DD/MM/YYYY vs MM/DD/YYYY
|
e.g. DD/MM/YYYY vs MM/DD/YYYY
|
||||||
"""
|
"""
|
||||||
if not ts: return
|
if not ts:
|
||||||
|
return
|
||||||
try:
|
try:
|
||||||
if isinstance(ts, str): ts = parse_dt(ts, dayfirst=dayfirst)
|
if isinstance(ts, str):
|
||||||
if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
|
ts = parse_dt(ts, dayfirst=dayfirst)
|
||||||
if utc: ts = ts.replace(tzinfo=timezone.utc)
|
if isinstance(ts, (int, float)):
|
||||||
if iso: return ts.isoformat()
|
ts = datetime.fromtimestamp(ts)
|
||||||
|
if utc:
|
||||||
|
ts = ts.replace(tzinfo=timezone.utc)
|
||||||
|
if iso:
|
||||||
|
return ts.isoformat()
|
||||||
return ts
|
return ts
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unable to parse timestamp {ts}: {e}")
|
logger.error(f"Unable to parse timestamp {ts}: {e}")
|
||||||
|
|||||||
@@ -4,8 +4,8 @@ from ipaddress import ip_address
|
|||||||
|
|
||||||
|
|
||||||
AUTHWALL_URLS = [
|
AUTHWALL_URLS = [
|
||||||
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
||||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -14,7 +14,6 @@ def check_url_or_raise(url: str) -> bool | ValueError:
|
|||||||
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
if not (url.startswith("http://") or url.startswith("https://")):
|
if not (url.startswith("http://") or url.startswith("https://")):
|
||||||
raise ValueError(f"Invalid URL scheme for url {url}")
|
raise ValueError(f"Invalid URL scheme for url {url}")
|
||||||
|
|
||||||
@@ -45,15 +44,18 @@ def check_url_or_raise(url: str) -> bool | ValueError:
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def domain_for_url(url: str) -> str:
|
def domain_for_url(url: str) -> str:
|
||||||
"""
|
"""
|
||||||
SECURITY: parse the domain using urllib to avoid any potential security issues
|
SECURITY: parse the domain using urllib to avoid any potential security issues
|
||||||
"""
|
"""
|
||||||
return urlparse(url).netloc
|
return urlparse(url).netloc
|
||||||
|
|
||||||
|
|
||||||
def clean(url: str) -> str:
|
def clean(url: str) -> str:
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def is_auth_wall(url: str) -> bool:
|
def is_auth_wall(url: str) -> bool:
|
||||||
"""
|
"""
|
||||||
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
||||||
@@ -64,13 +66,15 @@ def is_auth_wall(url: str) -> bool:
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def remove_get_parameters(url: str) -> str:
|
def remove_get_parameters(url: str) -> str:
|
||||||
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
|
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
|
||||||
# useful for mimetypes to work
|
# useful for mimetypes to work
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
new_url = urlunparse(parsed_url._replace(query=''))
|
new_url = urlunparse(parsed_url._replace(query=""))
|
||||||
return new_url
|
return new_url
|
||||||
|
|
||||||
|
|
||||||
def is_relevant_url(url: str) -> bool:
|
def is_relevant_url(url: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
|
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
|
||||||
@@ -78,42 +82,59 @@ def is_relevant_url(url: str) -> bool:
|
|||||||
clean_url = remove_get_parameters(url)
|
clean_url = remove_get_parameters(url)
|
||||||
|
|
||||||
# favicons
|
# favicons
|
||||||
if "favicon" in url: return False
|
if "favicon" in url:
|
||||||
|
return False
|
||||||
# ifnore icons
|
# ifnore icons
|
||||||
if clean_url.endswith(".ico"): return False
|
if clean_url.endswith(".ico"):
|
||||||
|
return False
|
||||||
# ignore SVGs
|
# ignore SVGs
|
||||||
if remove_get_parameters(url).endswith(".svg"): return False
|
if remove_get_parameters(url).endswith(".svg"):
|
||||||
|
return False
|
||||||
|
|
||||||
# twitter profile pictures
|
# twitter profile pictures
|
||||||
if "twimg.com/profile_images" in url: return False
|
if "twimg.com/profile_images" in url:
|
||||||
if "twimg.com" in url and "/default_profile_images" in url: return False
|
return False
|
||||||
|
if "twimg.com" in url and "/default_profile_images" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# instagram profile pictures
|
# instagram profile pictures
|
||||||
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
|
if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
|
||||||
|
return False
|
||||||
# instagram recurring images
|
# instagram recurring images
|
||||||
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
|
if "https://static.cdninstagram.com/rsrc.php/" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# telegram
|
# telegram
|
||||||
if "https://telegram.org/img/emoji/" in url: return False
|
if "https://telegram.org/img/emoji/" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# youtube
|
# youtube
|
||||||
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
|
if "https://www.youtube.com/s/gaming/emoji/" in url:
|
||||||
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
|
return False
|
||||||
if "https://www.youtube.com/s/search/audio/" in url: return False
|
if "https://yt3.ggpht.com" in url and "default-user=" in url:
|
||||||
|
return False
|
||||||
|
if "https://www.youtube.com/s/search/audio/" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# ok
|
# ok
|
||||||
if " https://ok.ru/res/i/" in url: return False
|
if " https://ok.ru/res/i/" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# vk
|
# vk
|
||||||
if "https://vk.com/emoji/" in url: return False
|
if "https://vk.com/emoji/" in url:
|
||||||
if "vk.com/images/" in url: return False
|
return False
|
||||||
if "vk.com/images/reaction/" in url: return False
|
if "vk.com/images/" in url:
|
||||||
|
return False
|
||||||
|
if "vk.com/images/reaction/" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
# wikipedia
|
# wikipedia
|
||||||
if "wikipedia.org/static" in url: return False
|
if "wikipedia.org/static" in url:
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def twitter_best_quality_url(url: str) -> str:
|
def twitter_best_quality_url(url: str) -> str:
|
||||||
"""
|
"""
|
||||||
some twitter image URLs point to a less-than best quality
|
some twitter image URLs point to a less-than best quality
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
""" This Webdriver class acts as a context manager for the selenium webdriver. """
|
"""This Webdriver class acts as a context manager for the selenium webdriver."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
#import domain_for_url
|
# import domain_for_url
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
from http.cookiejar import MozillaCookieJar
|
from http.cookiejar import MozillaCookieJar
|
||||||
|
|
||||||
@@ -20,15 +21,14 @@ from loguru import logger
|
|||||||
|
|
||||||
|
|
||||||
class CookieSettingDriver(webdriver.Firefox):
|
class CookieSettingDriver(webdriver.Firefox):
|
||||||
|
|
||||||
facebook_accept_cookies: bool
|
facebook_accept_cookies: bool
|
||||||
cookies: str
|
cookies: str
|
||||||
cookiejar: MozillaCookieJar
|
cookiejar: MozillaCookieJar
|
||||||
|
|
||||||
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
|
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
|
||||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||||
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
|
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
|
||||||
kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
|
kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
|
||||||
|
|
||||||
super(CookieSettingDriver, self).__init__(*args, **kwargs)
|
super(CookieSettingDriver, self).__init__(*args, **kwargs)
|
||||||
self.cookies = cookies
|
self.cookies = cookies
|
||||||
@@ -39,38 +39,44 @@ class CookieSettingDriver(webdriver.Firefox):
|
|||||||
if self.cookies or self.cookiejar:
|
if self.cookies or self.cookiejar:
|
||||||
# set up the driver to make it not 'cookie averse' (needs a context/URL)
|
# set up the driver to make it not 'cookie averse' (needs a context/URL)
|
||||||
# get the 'robots.txt' file which should be quick and easy
|
# get the 'robots.txt' file which should be quick and easy
|
||||||
robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
|
robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
|
||||||
super(CookieSettingDriver, self).get(robots_url)
|
super(CookieSettingDriver, self).get(robots_url)
|
||||||
|
|
||||||
if self.cookies:
|
if self.cookies:
|
||||||
# an explicit cookie is set for this site, use that first
|
# an explicit cookie is set for this site, use that first
|
||||||
for cookie in self.cookies.split(";"):
|
for cookie in self.cookies.split(";"):
|
||||||
for name, value in cookie.split("="):
|
for name, value in cookie.split("="):
|
||||||
self.driver.add_cookie({'name': name, 'value': value})
|
self.driver.add_cookie({"name": name, "value": value})
|
||||||
elif self.cookiejar:
|
elif self.cookiejar:
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
regex = re.compile(f"(www)?\.?{domain}$")
|
regex = re.compile(f"(www)?\.?{domain}$")
|
||||||
for cookie in self.cookiejar:
|
for cookie in self.cookiejar:
|
||||||
if regex.match(cookie.domain):
|
if regex.match(cookie.domain):
|
||||||
try:
|
try:
|
||||||
self.add_cookie({
|
self.add_cookie(
|
||||||
'name': cookie.name,
|
{
|
||||||
'value': cookie.value,
|
"name": cookie.name,
|
||||||
'path': cookie.path,
|
"value": cookie.value,
|
||||||
'domain': cookie.domain,
|
"path": cookie.path,
|
||||||
'secure': bool(cookie.secure),
|
"domain": cookie.domain,
|
||||||
'expiry': cookie.expires
|
"secure": bool(cookie.secure),
|
||||||
})
|
"expiry": cookie.expires,
|
||||||
|
}
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")
|
logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
super(CookieSettingDriver, self).get(url)
|
super(CookieSettingDriver, self).get(url)
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
# Try and use some common button text to reject/accept cookies
|
# Try and use some common button text to reject/accept cookies
|
||||||
for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Reject all", "Accept all cookies"]:
|
for text in [
|
||||||
|
"Refuse non-essential cookies",
|
||||||
|
"Decline optional cookies",
|
||||||
|
"Reject additional cookies",
|
||||||
|
"Reject all",
|
||||||
|
"Accept all cookies",
|
||||||
|
]:
|
||||||
try:
|
try:
|
||||||
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
|
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
|
||||||
self.find_element(By.XPATH, xpath).click()
|
self.find_element(By.XPATH, xpath).click()
|
||||||
@@ -89,11 +95,34 @@ class CookieSettingDriver(webdriver.Firefox):
|
|||||||
logger.warning("Unable to find the 'close' button on the facebook login window")
|
logger.warning("Unable to find the 'close' button on the facebook login window")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
else:
|
||||||
|
# for all other sites, try and use some common button text to reject/accept cookies
|
||||||
|
for text in [
|
||||||
|
"Refuse non-essential cookies",
|
||||||
|
"Decline optional cookies",
|
||||||
|
"Reject additional cookies",
|
||||||
|
"Reject all",
|
||||||
|
"Accept all cookies",
|
||||||
|
]:
|
||||||
|
try:
|
||||||
|
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
|
||||||
|
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
|
||||||
|
break
|
||||||
|
except selenium_exceptions.WebDriverException:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Webdriver:
|
class Webdriver:
|
||||||
def __init__(self, width: int, height: int, timeout_seconds: int,
|
def __init__(
|
||||||
facebook_accept_cookies: bool = False, http_proxy: str = "",
|
self,
|
||||||
print_options: dict = {}, auth: dict = {}) -> webdriver:
|
width: int,
|
||||||
|
height: int,
|
||||||
|
timeout_seconds: int,
|
||||||
|
facebook_accept_cookies: bool = False,
|
||||||
|
http_proxy: str = "",
|
||||||
|
print_options: dict = {},
|
||||||
|
auth: dict = {},
|
||||||
|
) -> webdriver:
|
||||||
self.width = width
|
self.width = width
|
||||||
self.height = height
|
self.height = height
|
||||||
self.timeout_seconds = timeout_seconds
|
self.timeout_seconds = timeout_seconds
|
||||||
@@ -108,20 +137,26 @@ class Webdriver:
|
|||||||
def __enter__(self) -> webdriver:
|
def __enter__(self) -> webdriver:
|
||||||
options = webdriver.FirefoxOptions()
|
options = webdriver.FirefoxOptions()
|
||||||
options.add_argument("--headless")
|
options.add_argument("--headless")
|
||||||
options.add_argument(f'--proxy-server={self.http_proxy}')
|
options.add_argument(f"--proxy-server={self.http_proxy}")
|
||||||
options.set_preference('network.protocol-handler.external.tg', False)
|
options.set_preference("network.protocol-handler.external.tg", False)
|
||||||
# if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
|
# if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
|
||||||
if self.facebook_accept_cookies:
|
if self.facebook_accept_cookies:
|
||||||
options.add_argument('--lang=en')
|
options.add_argument("--lang=en")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
|
self.driver = CookieSettingDriver(
|
||||||
facebook_accept_cookies=self.facebook_accept_cookies, options=options)
|
cookies=self.auth.get("cookies"),
|
||||||
|
cookiejar=self.auth.get("cookies_jar"),
|
||||||
|
facebook_accept_cookies=self.facebook_accept_cookies,
|
||||||
|
options=options,
|
||||||
|
)
|
||||||
self.driver.set_window_size(self.width, self.height)
|
self.driver.set_window_size(self.width, self.height)
|
||||||
self.driver.set_page_load_timeout(self.timeout_seconds)
|
self.driver.set_page_load_timeout(self.timeout_seconds)
|
||||||
self.driver.print_options = self.print_options
|
self.driver.print_options = self.print_options
|
||||||
except selenium_exceptions.TimeoutException as e:
|
except selenium_exceptions.TimeoutException as e:
|
||||||
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
|
logger.error(
|
||||||
|
f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
return self.driver
|
return self.driver
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
""" Version information for the auto_archiver package.
|
"""Version information for the auto_archiver package.
|
||||||
TODO: This is a placeholder to replicate previous versioning.
|
TODO: This is a placeholder to replicate previous versioning.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from importlib.metadata import version as get_version
|
from importlib.metadata import version as get_version
|
||||||
|
|
||||||
VERSION_SHORT = get_version("auto_archiver")
|
VERSION_SHORT = get_version("auto_archiver")
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
pytest conftest file, for shared fixtures and configuration
|
pytest conftest file, for shared fixtures and configuration
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
@@ -16,18 +17,20 @@ from auto_archiver.core.module import ModuleFactory
|
|||||||
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
|
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
|
||||||
# what comes first will be run first (at the end of all other tests not mentioned)
|
# what comes first will be run first (at the end of all other tests not mentioned)
|
||||||
# format is the name of the module (python file) without the .py extension
|
# format is the name of the module (python file) without the .py extension
|
||||||
TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
|
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def setup_module(request):
|
def setup_module(request):
|
||||||
def _setup_module(module_name, config={}):
|
def _setup_module(module_name, config=None):
|
||||||
|
if config is None:
|
||||||
|
config = {}
|
||||||
module_factory = ModuleFactory()
|
module_factory = ModuleFactory()
|
||||||
|
|
||||||
if isinstance(module_name, type):
|
if isinstance(module_name, type):
|
||||||
# get the module name:
|
# get the module name:
|
||||||
# if the class does not have a .name, use the name of the parent folder
|
# if the class does not have a .name, use the name of the parent folder
|
||||||
module_name = module_name.__module__.rsplit(".",2)[-2]
|
module_name = module_name.__module__.rsplit(".", 2)[-2]
|
||||||
|
|
||||||
m = module_factory.get_module(module_name, {module_name: config})
|
m = module_factory.get_module(module_name, {module_name: config})
|
||||||
# add the tmp_dir to the module
|
# add the tmp_dir to the module
|
||||||
@@ -36,12 +39,14 @@ def setup_module(request):
|
|||||||
|
|
||||||
def cleanup():
|
def cleanup():
|
||||||
tmp_dir.cleanup()
|
tmp_dir.cleanup()
|
||||||
|
|
||||||
request.addfinalizer(cleanup)
|
request.addfinalizer(cleanup)
|
||||||
|
|
||||||
return m
|
return m
|
||||||
|
|
||||||
return _setup_module
|
return _setup_module
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def check_hash():
|
def check_hash():
|
||||||
def _check_hash(filename: str, hash: str):
|
def _check_hash(filename: str, hash: str):
|
||||||
@@ -51,6 +56,7 @@ def check_hash():
|
|||||||
|
|
||||||
return _check_hash
|
return _check_hash
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def make_item():
|
def make_item():
|
||||||
def _make_item(url: str, **kwargs) -> Metadata:
|
def _make_item(url: str, **kwargs) -> Metadata:
|
||||||
@@ -62,7 +68,6 @@ def make_item():
|
|||||||
return _make_item
|
return _make_item
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_collection_modifyitems(items):
|
def pytest_collection_modifyitems(items):
|
||||||
module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
|
module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
|
||||||
|
|
||||||
@@ -78,13 +83,13 @@ def pytest_collection_modifyitems(items):
|
|||||||
items[:] = sorted_items
|
items[:] = sorted_items
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Incremental testing - fail tests in a class if any previous test fails
|
# Incremental testing - fail tests in a class if any previous test fails
|
||||||
# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
|
# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
|
||||||
|
|
||||||
# store history of failures per test class name and per index in parametrize (if parametrize used)
|
# store history of failures per test class name and per index in parametrize (if parametrize used)
|
||||||
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
|
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
|
||||||
|
|
||||||
|
|
||||||
def pytest_runtest_makereport(item, call):
|
def pytest_runtest_makereport(item, call):
|
||||||
if "incremental" in item.keywords:
|
if "incremental" in item.keywords:
|
||||||
# incremental marker is used
|
# incremental marker is used
|
||||||
@@ -93,17 +98,11 @@ def pytest_runtest_makereport(item, call):
|
|||||||
# retrieve the class name of the test
|
# retrieve the class name of the test
|
||||||
cls_name = str(item.cls)
|
cls_name = str(item.cls)
|
||||||
# retrieve the index of the test (if parametrize is used in combination with incremental)
|
# retrieve the index of the test (if parametrize is used in combination with incremental)
|
||||||
parametrize_index = (
|
parametrize_index = tuple(item.callspec.indices.values()) if hasattr(item, "callspec") else ()
|
||||||
tuple(item.callspec.indices.values())
|
|
||||||
if hasattr(item, "callspec")
|
|
||||||
else ()
|
|
||||||
)
|
|
||||||
# retrieve the name of the test function
|
# retrieve the name of the test function
|
||||||
test_name = item.originalname or item.name
|
test_name = item.originalname or item.name
|
||||||
# store in _test_failed_incremental the original name of the failed test
|
# store in _test_failed_incremental the original name of the failed test
|
||||||
_test_failed_incremental.setdefault(cls_name, {}).setdefault(
|
_test_failed_incremental.setdefault(cls_name, {}).setdefault(parametrize_index, test_name)
|
||||||
parametrize_index, test_name
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_runtest_setup(item):
|
def pytest_runtest_setup(item):
|
||||||
@@ -119,16 +118,17 @@ def pytest_runtest_setup(item):
|
|||||||
pytest.xfail(f"previous test failed ({test_name})")
|
pytest.xfail(f"previous test failed ({test_name})")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def unpickle():
|
def unpickle():
|
||||||
"""
|
"""
|
||||||
Returns a helper function that unpickles a file
|
Returns a helper function that unpickles a file
|
||||||
** gets the file from the test_files directory: tests/data/ **
|
** gets the file from the test_files directory: tests/data/ **
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _unpickle(path):
|
def _unpickle(path):
|
||||||
with open(os.path.join("tests/data", path), "rb") as f:
|
with open(os.path.join("tests/data", path), "rb") as f:
|
||||||
return pickle.load(f)
|
return pickle.load(f)
|
||||||
|
|
||||||
return _unpickle
|
return _unpickle
|
||||||
|
|
||||||
|
|
||||||
@@ -145,9 +145,9 @@ def sample_datetime():
|
|||||||
return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
|
return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture
|
||||||
def mock_sleep(mocker):
|
def mock_sleep(mocker):
|
||||||
"""Globally mock time.sleep to avoid delays."""
|
"""Mock time.sleep to avoid delays."""
|
||||||
return mocker.patch("time.sleep")
|
return mocker.patch("time.sleep")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user