mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Ruff format with defaults.
This commit is contained in:
22
.github/workflows/ruff.yaml
vendored
Normal file
22
.github/workflows/ruff.yaml
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
name: Ruff Formatting & Linting
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
ruff:
|
||||
name: Run Ruff Checks
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Run Ruff (Lint & Format Check)
|
||||
uses: astral-sh/ruff-action@v1
|
||||
with:
|
||||
args: "check . --output-format=concise"
|
||||
|
||||
- name: Run Ruff Format Check
|
||||
uses: astral-sh/ruff-action@v1
|
||||
with:
|
||||
args: "format --check ."
|
||||
@@ -1 +1 @@
|
||||
from scripts import generate_module_docs
|
||||
from scripts import generate_module_docs
|
||||
|
||||
@@ -10,12 +10,12 @@ MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_arch
|
||||
SAVE_FOLDER = Path(__file__).parent.parent / "source" / "modules" / "autogen"
|
||||
|
||||
type_color = {
|
||||
'feeder': "<span style='color: #FFA500'>[feeder](/core_modules.md#feeder-modules)</a></span>",
|
||||
'extractor': "<span style='color: #00FF00'>[extractor](/core_modules.md#extractor-modules)</a></span>",
|
||||
'enricher': "<span style='color: #0000FF'>[enricher](/core_modules.md#enricher-modules)</a></span>",
|
||||
'database': "<span style='color: #FF00FF'>[database](/core_modules.md#database-modules)</a></span>",
|
||||
'storage': "<span style='color: #FFFF00'>[storage](/core_modules.md#storage-modules)</a></span>",
|
||||
'formatter': "<span style='color: #00FFFF'>[formatter](/core_modules.md#formatter-modules)</a></span>",
|
||||
"feeder": "<span style='color: #FFA500'>[feeder](/core_modules.md#feeder-modules)</a></span>",
|
||||
"extractor": "<span style='color: #00FF00'>[extractor](/core_modules.md#extractor-modules)</a></span>",
|
||||
"enricher": "<span style='color: #0000FF'>[enricher](/core_modules.md#enricher-modules)</a></span>",
|
||||
"database": "<span style='color: #FF00FF'>[database](/core_modules.md#database-modules)</a></span>",
|
||||
"storage": "<span style='color: #FFFF00'>[storage](/core_modules.md#storage-modules)</a></span>",
|
||||
"formatter": "<span style='color: #00FFFF'>[formatter](/core_modules.md#formatter-modules)</a></span>",
|
||||
}
|
||||
|
||||
TABLE_HEADER = ("Option", "Description", "Default", "Type")
|
||||
@@ -34,6 +34,7 @@ steps:
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def generate_module_docs():
|
||||
yaml = YAML()
|
||||
SAVE_FOLDER.mkdir(exist_ok=True)
|
||||
@@ -48,49 +49,49 @@ def generate_module_docs():
|
||||
# generate the markdown file from the __manifest__.py file.
|
||||
|
||||
manifest = module.manifest
|
||||
for type in manifest['type']:
|
||||
for type in manifest["type"]:
|
||||
modules_by_type.setdefault(type, []).append(module)
|
||||
|
||||
description = "\n".join(l.lstrip() for l in manifest['description'].split("\n"))
|
||||
types = ", ".join(type_color[t] for t in manifest['type'])
|
||||
description = "\n".join(l.lstrip() for l in manifest["description"].split("\n"))
|
||||
types = ", ".join(type_color[t] for t in manifest["type"])
|
||||
readme_str = f"""
|
||||
# {manifest['name']}
|
||||
# {manifest["name"]}
|
||||
```{{admonition}} Module type
|
||||
|
||||
{types}
|
||||
```
|
||||
{description}
|
||||
"""
|
||||
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
|
||||
"""
|
||||
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"])
|
||||
|
||||
if not manifest['configs']:
|
||||
if not manifest["configs"]:
|
||||
config_string = f"# No configuration options for {module.name}.*\n"
|
||||
else:
|
||||
|
||||
config_table = header_row
|
||||
config_yaml = {}
|
||||
|
||||
global_yaml[module.name] = CommentedMap()
|
||||
global_yaml.yaml_set_comment_before_after_key(module.name, f"\n\n{module.display_name} configuration options")
|
||||
global_yaml.yaml_set_comment_before_after_key(
|
||||
module.name, f"\n\n{module.display_name} configuration options"
|
||||
)
|
||||
|
||||
|
||||
for key, value in manifest['configs'].items():
|
||||
type = value.get('type', 'string')
|
||||
if type == 'json_loader':
|
||||
value['type'] = 'json'
|
||||
elif type == 'str':
|
||||
for key, value in manifest["configs"].items():
|
||||
type = value.get("type", "string")
|
||||
if type == "json_loader":
|
||||
value["type"] = "json"
|
||||
elif type == "str":
|
||||
type = "string"
|
||||
|
||||
default = value.get('default', '')
|
||||
|
||||
default = value.get("default", "")
|
||||
config_yaml[key] = default
|
||||
|
||||
global_yaml[module.name][key] = default
|
||||
|
||||
if value.get('help', ''):
|
||||
global_yaml[module.name].yaml_add_eol_comment(value.get('help', ''), key)
|
||||
if value.get("help", ""):
|
||||
global_yaml[module.name].yaml_add_eol_comment(value.get("help", ""), key)
|
||||
|
||||
help = "**Required**. " if value.get('required', False) else "Optional. "
|
||||
help += value.get('help', '')
|
||||
help = "**Required**. " if value.get("required", False) else "Optional. "
|
||||
help += value.get("help", "")
|
||||
config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"
|
||||
global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
|
||||
readme_str += "\n## Configuration Options\n"
|
||||
@@ -98,18 +99,18 @@ def generate_module_docs():
|
||||
|
||||
config_string = io.BytesIO()
|
||||
yaml.dump({module.name: config_yaml}, config_string)
|
||||
config_string = config_string.getvalue().decode('utf-8')
|
||||
config_string = config_string.getvalue().decode("utf-8")
|
||||
yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
|
||||
readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
|
||||
|
||||
if manifest['configs']:
|
||||
if manifest["configs"]:
|
||||
readme_str += "\n### Command Line:\n"
|
||||
readme_str += config_table
|
||||
|
||||
# add a link to the autodoc refs
|
||||
readme_str += f"\n[API Reference](../../../autoapi/{module.name}/index)\n"
|
||||
# create the module.type folder, use the first type just for where to store the file
|
||||
for type in manifest['type']:
|
||||
for type in manifest["type"]:
|
||||
type_folder = SAVE_FOLDER / type
|
||||
type_folder.mkdir(exist_ok=True)
|
||||
with open(type_folder / f"{module.name}.md", "w") as f:
|
||||
@@ -117,10 +118,10 @@ def generate_module_docs():
|
||||
f.write(readme_str)
|
||||
generate_index(modules_by_type)
|
||||
|
||||
del global_yaml['placeholder']
|
||||
del global_yaml["placeholder"]
|
||||
global_string = io.BytesIO()
|
||||
global_yaml = yaml.dump(global_yaml, global_string)
|
||||
global_string = global_string.getvalue().decode('utf-8')
|
||||
global_string = global_string.getvalue().decode("utf-8")
|
||||
global_yaml = f"```yaml\n{global_string}\n```"
|
||||
with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f:
|
||||
f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table)
|
||||
@@ -144,4 +145,4 @@ def generate_index(modules_by_type):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
generate_module_docs()
|
||||
generate_module_docs()
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
from importlib.metadata import metadata
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.append(os.path.abspath('../scripts'))
|
||||
sys.path.append(os.path.abspath("../scripts"))
|
||||
from scripts import generate_module_docs
|
||||
from auto_archiver.version import __version__
|
||||
|
||||
@@ -20,33 +20,35 @@ project = package_metadata["name"]
|
||||
copyright = str(datetime.now().year)
|
||||
author = "Bellingcat"
|
||||
release = package_metadata["version"]
|
||||
language = 'en'
|
||||
language = "en"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
extensions = [
|
||||
"myst_parser", # Markdown support
|
||||
"autoapi.extension", # Generate API documentation from docstrings
|
||||
"sphinxcontrib.mermaid", # Mermaid diagrams
|
||||
"sphinx.ext.viewcode", # Source code links
|
||||
"myst_parser", # Markdown support
|
||||
"autoapi.extension", # Generate API documentation from docstrings
|
||||
"sphinxcontrib.mermaid", # Mermaid diagrams
|
||||
"sphinx.ext.viewcode", # Source code links
|
||||
"sphinx_copybutton",
|
||||
"sphinx.ext.napoleon", # Google-style and NumPy-style docstrings
|
||||
"sphinx.ext.napoleon", # Google-style and NumPy-style docstrings
|
||||
"sphinx.ext.autosectionlabel",
|
||||
# 'sphinx.ext.autosummary', # Summarize module/class/function docs
|
||||
]
|
||||
|
||||
templates_path = ['_templates']
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""]
|
||||
|
||||
|
||||
# -- AutoAPI Configuration ---------------------------------------------------
|
||||
autoapi_type = 'python'
|
||||
autoapi_type = "python"
|
||||
autoapi_dirs = ["../../src/auto_archiver/core/", "../../src/auto_archiver/utils/"]
|
||||
# get all the modules and add them to the autoapi_dirs
|
||||
autoapi_dirs.extend([f"../../src/auto_archiver/modules/{m}" for m in os.listdir("../../src/auto_archiver/modules")])
|
||||
autodoc_typehints = "signature" # Include type hints in the signature
|
||||
autoapi_ignore = ["*/version.py", ] # Ignore specific modules
|
||||
autoapi_keep_files = True # Option to retain intermediate JSON files for debugging
|
||||
autoapi_add_toctree_entry = True # Include API docs in the TOC
|
||||
autodoc_typehints = "signature" # Include type hints in the signature
|
||||
autoapi_ignore = [
|
||||
"*/version.py",
|
||||
] # Ignore specific modules
|
||||
autoapi_keep_files = True # Option to retain intermediate JSON files for debugging
|
||||
autoapi_add_toctree_entry = True # Include API docs in the TOC
|
||||
autoapi_python_use_implicit_namespaces = True
|
||||
autoapi_template_dir = "../_templates/autoapi"
|
||||
autoapi_options = [
|
||||
@@ -59,13 +61,13 @@ autoapi_options = [
|
||||
|
||||
# -- Markdown Support --------------------------------------------------------
|
||||
myst_enable_extensions = [
|
||||
"deflist", # Definition lists
|
||||
"html_admonition", # HTML-style admonitions
|
||||
"html_image", # Inline HTML images
|
||||
"replacements", # Substitutions like (C)
|
||||
"smartquotes", # Smart quotes
|
||||
"linkify", # Auto-detect links
|
||||
"substitution", # Text substitutions
|
||||
"deflist", # Definition lists
|
||||
"html_admonition", # HTML-style admonitions
|
||||
"html_image", # Inline HTML images
|
||||
"replacements", # Substitutions like (C)
|
||||
"smartquotes", # Smart quotes
|
||||
"linkify", # Auto-detect links
|
||||
"substitution", # Text substitutions
|
||||
]
|
||||
myst_heading_anchors = 2
|
||||
myst_fence_as_directive = ["mermaid"]
|
||||
@@ -76,7 +78,7 @@ source_suffix = {
|
||||
}
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
html_theme = 'sphinx_book_theme'
|
||||
html_theme = "sphinx_book_theme"
|
||||
html_static_path = ["../_static"]
|
||||
html_css_files = ["custom.css"]
|
||||
html_title = f"Auto Archiver v{__version__}"
|
||||
@@ -87,7 +89,6 @@ html_theme_options = {
|
||||
}
|
||||
|
||||
|
||||
|
||||
copybutton_prompt_text = r">>> |\.\.\."
|
||||
copybutton_prompt_is_regexp = True
|
||||
copybutton_only_copy_prompt_lines = False
|
||||
copybutton_only_copy_prompt_lines = False
|
||||
|
||||
@@ -98,6 +98,9 @@ line-length = 120
|
||||
|
||||
[tool.ruff.lint]
|
||||
#add bugbear?
|
||||
# I : isort
|
||||
# UP : upgrade, e.g. use fstrings
|
||||
# ANN : annotations
|
||||
#extend-select = ["B"]
|
||||
|
||||
# E701 - multiple statements on one line (I vote to keep this but I notice it's used quite a lot!)
|
||||
|
||||
@@ -70,11 +70,7 @@ def main(credentials, token):
|
||||
print(emailAddress)
|
||||
|
||||
# Call the Drive v3 API and return some files
|
||||
results = (
|
||||
service.files()
|
||||
.list(pageSize=10, fields="nextPageToken, files(id, name)")
|
||||
.execute()
|
||||
)
|
||||
results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
|
||||
items = results.get("files", [])
|
||||
|
||||
if not items:
|
||||
|
||||
@@ -8,12 +8,14 @@ from auto_archiver.core.module import ModuleFactory
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
from auto_archiver.core.config import EMPTY_CONFIG
|
||||
|
||||
|
||||
class SchemaEncoder(json.JSONEncoder):
|
||||
def default(self, obj):
|
||||
if isinstance(obj, set):
|
||||
return list(obj)
|
||||
return json.JSONEncoder.default(self, obj)
|
||||
|
||||
|
||||
# Get available modules
|
||||
module_factory = ModuleFactory()
|
||||
available_modules = module_factory.available_modules()
|
||||
@@ -21,32 +23,40 @@ available_modules = module_factory.available_modules()
|
||||
modules_by_type = {}
|
||||
# Categorize modules by type
|
||||
for module in available_modules:
|
||||
for type in module.manifest.get('type', []):
|
||||
for type in module.manifest.get("type", []):
|
||||
modules_by_type.setdefault(type, []).append(module)
|
||||
|
||||
all_modules_ordered_by_type = sorted(available_modules, key=lambda x: (MODULE_TYPES.index(x.type[0]), not x.requires_setup))
|
||||
all_modules_ordered_by_type = sorted(
|
||||
available_modules, key=lambda x: (MODULE_TYPES.index(x.type[0]), not x.requires_setup)
|
||||
)
|
||||
|
||||
yaml: YAML = YAML()
|
||||
|
||||
config_string = io.BytesIO()
|
||||
yaml.dump(EMPTY_CONFIG, config_string)
|
||||
config_string = config_string.getvalue().decode('utf-8')
|
||||
config_string = config_string.getvalue().decode("utf-8")
|
||||
output_schema = {
|
||||
'modules': dict((module.name,
|
||||
{
|
||||
'name': module.name,
|
||||
'display_name': module.display_name,
|
||||
'manifest': module.manifest,
|
||||
'configs': module.configs or None
|
||||
}
|
||||
) for module in all_modules_ordered_by_type),
|
||||
'steps': dict((f"{module_type}s", [module.name for module in modules_by_type[module_type]]) for module_type in MODULE_TYPES),
|
||||
'configs': [m.name for m in all_modules_ordered_by_type if m.configs],
|
||||
'module_types': MODULE_TYPES,
|
||||
'empty_config': config_string
|
||||
"modules": dict(
|
||||
(
|
||||
module.name,
|
||||
{
|
||||
"name": module.name,
|
||||
"display_name": module.display_name,
|
||||
"manifest": module.manifest,
|
||||
"configs": module.configs or None,
|
||||
},
|
||||
)
|
||||
for module in all_modules_ordered_by_type
|
||||
),
|
||||
"steps": dict(
|
||||
(f"{module_type}s", [module.name for module in modules_by_type[module_type]]) for module_type in MODULE_TYPES
|
||||
),
|
||||
"configs": [m.name for m in all_modules_ordered_by_type if m.configs],
|
||||
"module_types": MODULE_TYPES,
|
||||
"empty_config": config_string,
|
||||
}
|
||||
|
||||
current_file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
output_file = os.path.join(current_file_dir, 'settings/src/schema.json')
|
||||
with open(output_file, 'w') as file:
|
||||
json.dump(output_schema, file, indent=4, cls=SchemaEncoder)
|
||||
output_file = os.path.join(current_file_dir, "settings/src/schema.json")
|
||||
with open(output_file, "w") as file:
|
||||
json.dump(output_schema, file, indent=4, cls=SchemaEncoder)
|
||||
|
||||
@@ -12,7 +12,6 @@ Then run this script to create a new session file.
|
||||
You will need to provide your phone number and a 2FA code the first time you run this script.
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
@@ -26,4 +25,3 @@ SESSION_FILE = "secrets/anon-insta"
|
||||
os.makedirs("secrets", exist_ok=True)
|
||||
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
||||
logger.success(f"New session file created: {SESSION_FILE}.session")
|
||||
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
""" Entry point for the auto_archiver package. """
|
||||
"""Entry point for the auto_archiver package."""
|
||||
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
import sys
|
||||
|
||||
|
||||
def main():
|
||||
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
|
||||
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]):
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
""" Core modules to handle things such as orchestration, metadata and configs..
|
||||
"""Core modules to handle things such as orchestration, metadata and configs.."""
|
||||
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .base_module import BaseModule
|
||||
@@ -14,4 +13,4 @@ from .enricher import Enricher
|
||||
from .feeder import Feeder
|
||||
from .storage import Storage
|
||||
from .extractor import Extractor
|
||||
from .formatter import Formatter
|
||||
from .formatter import Formatter
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
||||
from abc import ABC
|
||||
from copy import deepcopy, copy
|
||||
from tempfile import TemporaryDirectory
|
||||
@@ -13,8 +12,8 @@ from loguru import logger
|
||||
if TYPE_CHECKING:
|
||||
from .module import ModuleFactory
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
class BaseModule(ABC):
|
||||
"""
|
||||
Base module class. All modules should inherit from this class.
|
||||
|
||||
@@ -46,14 +45,13 @@ class BaseModule(ABC):
|
||||
|
||||
@property
|
||||
def storages(self) -> list:
|
||||
return self.config.get('storages', [])
|
||||
return self.config.get("storages", [])
|
||||
|
||||
def config_setup(self, config: dict):
|
||||
|
||||
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
||||
# change values to affect other modules
|
||||
config = deepcopy(config)
|
||||
authentication = deepcopy(config.pop('authentication', {}))
|
||||
authentication = deepcopy(config.pop("authentication", {}))
|
||||
|
||||
self.authentication = authentication
|
||||
self.config = config
|
||||
@@ -68,7 +66,7 @@ class BaseModule(ABC):
|
||||
"""
|
||||
Returns the authentication information for a given site. This is used to authenticate
|
||||
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
|
||||
|
||||
|
||||
:param site: the domain of the site to get authentication information for
|
||||
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
|
||||
|
||||
@@ -94,7 +92,6 @@ class BaseModule(ABC):
|
||||
# add the 'www' version of the site to the list of sites to check
|
||||
authdict = {}
|
||||
|
||||
|
||||
for to_try in [site, f"www.{site}"]:
|
||||
if to_try in self.authentication:
|
||||
authdict.update(self.authentication[to_try])
|
||||
@@ -104,17 +101,20 @@ class BaseModule(ABC):
|
||||
if not authdict:
|
||||
for key in self.authentication.keys():
|
||||
if key in site or site in key:
|
||||
logger.debug(f"Could not find exact authentication information for site '{site}'. \
|
||||
logger.debug(
|
||||
f"Could not find exact authentication information for site '{site}'. \
|
||||
did find information for '{key}' which is close, is this what you meant? \
|
||||
If so, edit your authentication settings to make sure it exactly matches.")
|
||||
If so, edit your authentication settings to make sure it exactly matches."
|
||||
)
|
||||
|
||||
def get_ytdlp_cookiejar(args):
|
||||
import yt_dlp
|
||||
from yt_dlp import parse_options
|
||||
|
||||
logger.debug(f"Extracting cookies from settings: {args[1]}")
|
||||
# parse_options returns a named tuple as follows, we only need the ydl_options part
|
||||
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
|
||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
||||
ytdlp_opts = getattr(parse_options(args), "ydl_opts")
|
||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||
|
||||
get_cookiejar_options = None
|
||||
@@ -125,22 +125,21 @@ If so, edit your authentication settings to make sure it exactly matches.")
|
||||
# 3. cookies_from_browser setting in global config
|
||||
# 4. cookies_file setting in global config
|
||||
|
||||
if 'cookies_from_browser' in authdict:
|
||||
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
|
||||
elif 'cookies_file' in authdict:
|
||||
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
|
||||
elif 'cookies_from_browser' in self.authentication:
|
||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
||||
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
|
||||
elif 'cookies_file' in self.authentication:
|
||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
||||
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
|
||||
if "cookies_from_browser" in authdict:
|
||||
get_cookiejar_options = ["--cookies-from-browser", authdict["cookies_from_browser"]]
|
||||
elif "cookies_file" in authdict:
|
||||
get_cookiejar_options = ["--cookies", authdict["cookies_file"]]
|
||||
elif "cookies_from_browser" in self.authentication:
|
||||
authdict["cookies_from_browser"] = self.authentication["cookies_from_browser"]
|
||||
get_cookiejar_options = ["--cookies-from-browser", self.authentication["cookies_from_browser"]]
|
||||
elif "cookies_file" in self.authentication:
|
||||
authdict["cookies_file"] = self.authentication["cookies_file"]
|
||||
get_cookiejar_options = ["--cookies", self.authentication["cookies_file"]]
|
||||
|
||||
|
||||
if get_cookiejar_options:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
authdict["cookies_jar"] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
|
||||
return authdict
|
||||
|
||||
|
||||
def repr(self):
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
||||
|
||||
@@ -20,12 +20,14 @@ _yaml: YAML = YAML()
|
||||
|
||||
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
||||
|
||||
EMPTY_CONFIG = _yaml.load("""
|
||||
EMPTY_CONFIG = _yaml.load(
|
||||
"""
|
||||
# Auto Archiver Configuration
|
||||
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
steps:"""
|
||||
+ "".join([f"\n {module}s: []" for module in MODULE_TYPES])
|
||||
+ """
|
||||
|
||||
# Global configuration
|
||||
|
||||
@@ -52,14 +54,14 @@ authentication: {}
|
||||
logging:
|
||||
level: INFO
|
||||
|
||||
""")
|
||||
"""
|
||||
)
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
|
||||
# Arg Parse Actions/Classes
|
||||
class AuthenticationJsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
|
||||
try:
|
||||
auth_dict = json.loads(values)
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
@@ -68,34 +70,38 @@ class AuthenticationJsonParseAction(argparse.Action):
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
with open(path, "r") as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
if auth_dict.get("authentication"):
|
||||
auth_dict = auth_dict["authentication"]
|
||||
auth_dict["load_from_file"] = path
|
||||
return auth_dict
|
||||
except:
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
if isinstance(auth_dict, dict) and auth_dict.get("from_file"):
|
||||
auth_dict = load_from_file(auth_dict["from_file"])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Authentication must be a dictionary of site names and their authentication methods"
|
||||
)
|
||||
global_options = ["cookies_from_browser", "cookies_file", "load_from_file"]
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}"
|
||||
)
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
@@ -106,8 +112,8 @@ class UniqueAppendAction(argparse.Action):
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
def error(self, message):
|
||||
"""
|
||||
Override of error to format a nicer looking error message using logger
|
||||
@@ -136,8 +142,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
return super().parse_known_args(args, namespace)
|
||||
|
||||
|
||||
# Config Utils
|
||||
|
||||
|
||||
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
dotdict = {}
|
||||
|
||||
@@ -151,6 +159,7 @@ def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
process_subdict(yaml_conf)
|
||||
return dotdict
|
||||
|
||||
|
||||
def from_dot_notation(dotdict: dict) -> dict:
|
||||
normal_dict = {}
|
||||
|
||||
@@ -171,9 +180,11 @@ def from_dot_notation(dotdict: dict) -> dict:
|
||||
def is_list_type(value):
|
||||
return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set)
|
||||
|
||||
|
||||
def is_dict_type(value):
|
||||
return isinstance(value, dict) or isinstance(value, CommentedMap)
|
||||
|
||||
|
||||
def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_dict: CommentedMap = deepcopy(yaml_dict)
|
||||
|
||||
@@ -184,7 +195,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_subdict[key] = value
|
||||
continue
|
||||
|
||||
if key == 'steps':
|
||||
if key == "steps":
|
||||
for module_type, modules in value.items():
|
||||
# overwrite the 'steps' from the config file with the ones from the CLI
|
||||
yaml_subdict[key][module_type] = modules
|
||||
@@ -199,6 +210,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
update_dict(from_dot_notation(dotdict), yaml_dict)
|
||||
return yaml_dict
|
||||
|
||||
|
||||
def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
config = None
|
||||
try:
|
||||
@@ -212,6 +224,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
|
||||
return config
|
||||
|
||||
|
||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
|
||||
|
||||
@@ -219,13 +232,14 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
config_to_save = deepcopy(config)
|
||||
|
||||
auth_dict = config_to_save.get("authentication", {})
|
||||
if auth_dict and auth_dict.get('load_from_file'):
|
||||
if auth_dict and auth_dict.get("load_from_file"):
|
||||
# remove all other values from the config, don't want to store it in the config file
|
||||
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
config_to_save.pop("urls", None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
|
||||
|
||||
def is_valid_config(config: CommentedMap) -> bool:
|
||||
return config and config != EMPTY_CONFIG
|
||||
return config and config != EMPTY_CONFIG
|
||||
|
||||
@@ -1,23 +1,15 @@
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
"name": "", # the display name of the module
|
||||
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
|
||||
"type": [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
"description": "", # a description of the module
|
||||
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
"version": "1.0", # the version of the module
|
||||
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
Database module for the auto-archiver that defines the interface for implementing database modules
|
||||
in the media archiving framework.
|
||||
in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -9,6 +9,7 @@ from typing import Union
|
||||
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
class Database(BaseModule):
|
||||
"""
|
||||
Base class for implementing database modules in the media archiving framework.
|
||||
@@ -20,7 +21,7 @@ class Database(BaseModule):
|
||||
"""signals the DB that the given item archival has started"""
|
||||
pass
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
"""update DB accordingly for failure"""
|
||||
pass
|
||||
|
||||
@@ -34,6 +35,6 @@ class Database(BaseModule):
|
||||
return False
|
||||
|
||||
@abstractmethod
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
pass
|
||||
|
||||
@@ -8,13 +8,15 @@ the archiving step and before storage or formatting.
|
||||
|
||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
||||
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for extractors.
|
||||
"""The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for extractors.
|
||||
|
||||
Factory method to initialize an extractor instance based on its name.
|
||||
Factory method to initialize an extractor instance based on its name.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from abc import abstractmethod
|
||||
@@ -39,7 +40,7 @@ class Extractor(BaseModule):
|
||||
Used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
"""
|
||||
return url
|
||||
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
"""
|
||||
Returns a match object if the given URL matches the valid_url pattern or False/None if not.
|
||||
@@ -58,7 +59,7 @@ class Extractor(BaseModule):
|
||||
"""
|
||||
if self.valid_url:
|
||||
return self.match_link(url) is not None
|
||||
|
||||
|
||||
return True
|
||||
|
||||
def _guess_file_type(self, path: str) -> str:
|
||||
@@ -74,16 +75,17 @@ class Extractor(BaseModule):
|
||||
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
|
||||
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
|
||||
"""
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
"""
|
||||
if not to_filename:
|
||||
to_filename = url.split('/')[-1].split('?')[0]
|
||||
to_filename = url.split("/")[-1].split("?")[0]
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
if verbose:
|
||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
try:
|
||||
d = requests.get(url, stream=True, headers=headers, timeout=30)
|
||||
@@ -91,12 +93,12 @@ class Extractor(BaseModule):
|
||||
|
||||
# get mimetype from the response headers
|
||||
if not mimetypes.guess_type(to_filename)[0]:
|
||||
content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
|
||||
content_type = d.headers.get("Content-Type") or self._guess_file_type(url)
|
||||
extension = mimetypes.guess_extension(content_type)
|
||||
if extension:
|
||||
to_filename += extension
|
||||
|
||||
with open(to_filename, 'wb') as f:
|
||||
with open(to_filename, "wb") as f:
|
||||
for chunk in d.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
return to_filename
|
||||
@@ -108,8 +110,8 @@ class Extractor(BaseModule):
|
||||
def download(self, item: Metadata) -> Metadata | False:
|
||||
"""
|
||||
Downloads the media from the given URL and returns a Metadata object with the downloaded media.
|
||||
|
||||
|
||||
If the URL is not supported or the download fails, this method should return False.
|
||||
|
||||
"""
|
||||
pass
|
||||
pass
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -7,8 +7,8 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import BaseModule
|
||||
|
||||
class Feeder(BaseModule):
|
||||
|
||||
class Feeder(BaseModule):
|
||||
"""
|
||||
Base class for implementing feeders in the media archiving framework.
|
||||
|
||||
@@ -19,7 +19,7 @@ class Feeder(BaseModule):
|
||||
def __iter__(self) -> Metadata:
|
||||
"""
|
||||
Returns an iterator (use `yield`) over the items to be archived.
|
||||
|
||||
|
||||
These should be instances of Metadata, typically created with Metadata().set_url(url).
|
||||
"""
|
||||
return None
|
||||
return None
|
||||
|
||||
@@ -12,7 +12,7 @@ from auto_archiver.core import Metadata, Media, BaseModule
|
||||
class Formatter(BaseModule):
|
||||
"""
|
||||
Base class for implementing formatters in the media archiving framework.
|
||||
|
||||
|
||||
Subclasses must implement the `format` method to define their behavior.
|
||||
"""
|
||||
|
||||
@@ -21,4 +21,4 @@ class Formatter(BaseModule):
|
||||
"""
|
||||
Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
|
||||
"""
|
||||
return None
|
||||
return None
|
||||
|
||||
@@ -27,6 +27,7 @@ class Media:
|
||||
- properties: Additional metadata or transformations for the media.
|
||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||
"""
|
||||
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
@@ -52,14 +53,15 @@ class Media:
|
||||
This function returns a generator for all the inner media.
|
||||
|
||||
"""
|
||||
if include_self: yield self
|
||||
if include_self:
|
||||
yield self
|
||||
for prop in self.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
if isinstance(prop, Media):
|
||||
for inner_media in prop.all_inner_media(include_self=True):
|
||||
yield inner_media
|
||||
if isinstance(prop, list):
|
||||
for prop_media in prop:
|
||||
if isinstance(prop_media, Media):
|
||||
if isinstance(prop_media, Media):
|
||||
for inner_media in prop_media.all_inner_media(include_self=True):
|
||||
yield inner_media
|
||||
|
||||
@@ -110,15 +112,17 @@ class Media:
|
||||
# checks for video streams with ffmpeg, or min file size for a video
|
||||
# self.is_video() should be used together with this method
|
||||
try:
|
||||
streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
|
||||
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
|
||||
logger.warning(f"STREAMS FOR {self.filename} {streams}")
|
||||
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
||||
except Error: return False # ffmpeg errors when reading bad files
|
||||
except Error:
|
||||
return False # ffmpeg errors when reading bad files
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
fsize = os.path.getsize(self.filename)
|
||||
return fsize > 20_000
|
||||
except: pass
|
||||
except:
|
||||
pass
|
||||
return True
|
||||
|
||||
@@ -21,6 +21,7 @@ from loguru import logger
|
||||
|
||||
from .media import Media
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Metadata:
|
||||
@@ -40,7 +41,8 @@ class Metadata:
|
||||
- If `True`, this instance's values are overwritten by `right`.
|
||||
- If `False`, the inverse applies.
|
||||
"""
|
||||
if not right: return self
|
||||
if not right:
|
||||
return self
|
||||
if overwrite_left:
|
||||
if right.status and len(right.status):
|
||||
self.status = right.status
|
||||
@@ -50,8 +52,10 @@ class Metadata:
|
||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||
self.set(k, v)
|
||||
else: # key conflict
|
||||
if type(v) in [dict, set]: self.set(k, self.get(k) | v)
|
||||
elif type(v) == list: self.set(k, self.get(k) + v)
|
||||
if type(v) in [dict, set]:
|
||||
self.set(k, self.get(k) | v)
|
||||
elif type(v) == list:
|
||||
self.set(k, self.get(k) + v)
|
||||
self.media.extend(right.media)
|
||||
else: # invert and do same logic
|
||||
return right.merge(self)
|
||||
@@ -69,7 +73,7 @@ class Metadata:
|
||||
|
||||
def append(self, key: str, val: Any) -> Metadata:
|
||||
if key not in self.metadata:
|
||||
self.metadata[key] = []
|
||||
self.metadata[key] = []
|
||||
self.metadata[key] = val
|
||||
return self
|
||||
|
||||
@@ -80,24 +84,26 @@ class Metadata:
|
||||
return self.metadata.get(key, default)
|
||||
|
||||
def success(self, context: str = None) -> Metadata:
|
||||
if context: self.status = f"{context}: success"
|
||||
else: self.status = "success"
|
||||
if context:
|
||||
self.status = f"{context}: success"
|
||||
else:
|
||||
self.status = "success"
|
||||
return self
|
||||
|
||||
def is_success(self) -> bool:
|
||||
return "success" in self.status
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
meaningfull_ids = set(self.metadata.keys()) - set(["_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"])
|
||||
meaningfull_ids = set(self.metadata.keys()) - set(
|
||||
["_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"]
|
||||
)
|
||||
return not self.is_success() and len(self.media) == 0 and len(meaningfull_ids) == 0
|
||||
|
||||
@property # getter .netloc
|
||||
def netloc(self) -> str:
|
||||
return urlparse(self.get_url()).netloc
|
||||
|
||||
|
||||
# custom getter/setters
|
||||
|
||||
# custom getter/setters
|
||||
|
||||
def set_url(self, url: str) -> Metadata:
|
||||
assert type(url) is str and len(url) > 0, "invalid URL"
|
||||
@@ -127,12 +133,17 @@ class Metadata:
|
||||
|
||||
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
|
||||
ts = self.get("timestamp")
|
||||
if not ts: return
|
||||
if not ts:
|
||||
return
|
||||
try:
|
||||
if type(ts) == str: ts = datetime.datetime.fromisoformat(ts)
|
||||
if type(ts) == float: ts = datetime.datetime.fromtimestamp(ts)
|
||||
if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
|
||||
if iso: return ts.isoformat()
|
||||
if type(ts) == str:
|
||||
ts = datetime.datetime.fromisoformat(ts)
|
||||
if type(ts) == float:
|
||||
ts = datetime.datetime.fromtimestamp(ts)
|
||||
if utc:
|
||||
ts = ts.replace(tzinfo=datetime.timezone.utc)
|
||||
if iso:
|
||||
return ts.isoformat()
|
||||
return ts
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to parse timestamp {ts}: {e}")
|
||||
@@ -140,16 +151,20 @@ class Metadata:
|
||||
|
||||
def add_media(self, media: Media, id: str = None) -> Metadata:
|
||||
# adds a new media, optionally including an id
|
||||
if media is None: return
|
||||
if media is None:
|
||||
return
|
||||
if id is not None:
|
||||
assert not len([1 for m in self.media if m.get("id") == id]), f"cannot add 2 pieces of media with the same id {id}"
|
||||
assert not len([1 for m in self.media if m.get("id") == id]), (
|
||||
f"cannot add 2 pieces of media with the same id {id}"
|
||||
)
|
||||
media.set("id", id)
|
||||
self.media.append(media)
|
||||
return media
|
||||
|
||||
def get_media_by_id(self, id: str, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if m.get("id") == id: return m
|
||||
if m.get("id") == id:
|
||||
return m
|
||||
return default
|
||||
|
||||
def remove_duplicate_media_by_hash(self) -> None:
|
||||
@@ -159,7 +174,8 @@ class Metadata:
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(chunksize)
|
||||
if not buf: break
|
||||
if not buf:
|
||||
break
|
||||
hash_algo.update(buf)
|
||||
return hash_algo.hexdigest()
|
||||
|
||||
@@ -167,15 +183,18 @@ class Metadata:
|
||||
new_media = []
|
||||
for m in self.media:
|
||||
h = m.get("hash")
|
||||
if not h: h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
if len(h) and h in media_hashes: continue
|
||||
if not h:
|
||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
if len(h) and h in media_hashes:
|
||||
continue
|
||||
media_hashes.add(h)
|
||||
new_media.append(m)
|
||||
self.media = new_media
|
||||
|
||||
def get_first_image(self, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if "image" in m.mimetype: return m
|
||||
if "image" in m.mimetype:
|
||||
return m
|
||||
return default
|
||||
|
||||
def set_final_media(self, final: Media) -> Metadata:
|
||||
@@ -193,22 +212,25 @@ class Metadata:
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
|
||||
@staticmethod
|
||||
def choose_most_complete(results: List[Metadata]) -> Metadata:
|
||||
# returns the most complete result from a list of results
|
||||
# prioritizes results with more media, then more metadata
|
||||
if len(results) == 0: return None
|
||||
if len(results) == 1: return results[0]
|
||||
if len(results) == 0:
|
||||
return None
|
||||
if len(results) == 1:
|
||||
return results[0]
|
||||
most_complete = results[0]
|
||||
for r in results[1:]:
|
||||
if len(r.media) > len(most_complete.media): most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
|
||||
if len(r.media) > len(most_complete.media):
|
||||
most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata):
|
||||
most_complete = r
|
||||
return most_complete
|
||||
|
||||
def set_context(self, key: str, val: Any) -> Metadata:
|
||||
self._context[key] = val
|
||||
return self
|
||||
|
||||
|
||||
def get_context(self, key: str, default: Any = None) -> Any:
|
||||
return self._context.get(key, default)
|
||||
return self._context.get(key, default)
|
||||
|
||||
@@ -3,6 +3,7 @@ Defines the Step abstract base class, which acts as a blueprint for steps in the
|
||||
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -24,17 +25,17 @@ if TYPE_CHECKING:
|
||||
|
||||
HAS_SETUP_PATHS = False
|
||||
|
||||
class ModuleFactory:
|
||||
|
||||
class ModuleFactory:
|
||||
def __init__(self):
|
||||
self._lazy_modules = {}
|
||||
|
||||
def setup_paths(self, paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
|
||||
"""
|
||||
global HAS_SETUP_PATHS
|
||||
|
||||
@@ -46,11 +47,13 @@ class ModuleFactory:
|
||||
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
if HAS_SETUP_PATHS == True:
|
||||
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
if HAS_SETUP_PATHS == True:
|
||||
logger.warning(
|
||||
f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing)."
|
||||
)
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
# sort based on the length of the path, so that the longest path is last in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
@@ -60,20 +63,20 @@ class ModuleFactory:
|
||||
def get_module(self, module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
|
||||
"""
|
||||
return self.get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
|
||||
To load an actual module, call .setup() on a lazy module
|
||||
|
||||
|
||||
"""
|
||||
if module_name in self._lazy_modules:
|
||||
return self._lazy_modules[module_name]
|
||||
@@ -81,13 +84,14 @@ class ModuleFactory:
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||
if 'archiver' in module_name:
|
||||
if "archiver" in module_name:
|
||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||
raise IndexError(message)
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
def available_modules(
|
||||
self, limit_to_modules: List[str] = [], suppress_warnings: bool = False
|
||||
) -> List[LazyBaseModule]:
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
@@ -119,7 +123,7 @@ class ModuleFactory:
|
||||
self._lazy_modules[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
@@ -127,15 +131,16 @@ class ModuleFactory:
|
||||
|
||||
return all_modules
|
||||
|
||||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
|
||||
"""
|
||||
A lazy module class, which only loads the manifest and does not load the module itself.
|
||||
|
||||
This is useful for getting information about a module without actually loading it.
|
||||
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
path: str
|
||||
@@ -152,30 +157,30 @@ class LazyBaseModule:
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self.manifest['type']
|
||||
return self.manifest["type"]
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
if not self._entry_point and not self.manifest['entry_point']:
|
||||
if not self._entry_point and not self.manifest["entry_point"]:
|
||||
# try to create the entry point from the module name
|
||||
self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
|
||||
return self._entry_point
|
||||
|
||||
@property
|
||||
def dependencies(self) -> dict:
|
||||
return self.manifest['dependencies']
|
||||
|
||||
return self.manifest["dependencies"]
|
||||
|
||||
@property
|
||||
def configs(self) -> dict:
|
||||
return self.manifest['configs']
|
||||
|
||||
return self.manifest["configs"]
|
||||
|
||||
@property
|
||||
def requires_setup(self) -> bool:
|
||||
return self.manifest['requires_setup']
|
||||
|
||||
return self.manifest["requires_setup"]
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
return self.manifest['name']
|
||||
return self.manifest["name"]
|
||||
|
||||
@property
|
||||
def manifest(self) -> dict:
|
||||
@@ -190,16 +195,15 @@ class LazyBaseModule:
|
||||
manifest.update(ast.literal_eval(f.read()))
|
||||
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
|
||||
self._manifest = manifest
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
self.version = manifest['version']
|
||||
self._entry_point = manifest["entry_point"]
|
||||
self.description = manifest["description"]
|
||||
self.version = manifest["version"]
|
||||
|
||||
return manifest
|
||||
|
||||
def load(self, config) -> BaseModule:
|
||||
|
||||
if self._instance:
|
||||
return self._instance
|
||||
|
||||
@@ -210,8 +214,10 @@ class LazyBaseModule:
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
def check_python_dep(dep):
|
||||
@@ -219,7 +225,7 @@ class LazyBaseModule:
|
||||
try:
|
||||
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
return True
|
||||
except:
|
||||
@@ -231,13 +237,12 @@ class LazyBaseModule:
|
||||
|
||||
return find_spec(dep)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), check_python_dep)
|
||||
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
||||
|
||||
check_deps(self.dependencies.get("python", []), check_python_dep)
|
||||
check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep))
|
||||
|
||||
logger.debug(f"Loading module '{self.display_name}'...")
|
||||
|
||||
for qualname in [self.name, f'auto_archiver.modules.{self.name}']:
|
||||
for qualname in [self.name, f"auto_archiver.modules.{self.name}"]:
|
||||
try:
|
||||
# first import the whole module, to make sure it's working properly
|
||||
__import__(qualname)
|
||||
@@ -246,10 +251,10 @@ class LazyBaseModule:
|
||||
pass
|
||||
|
||||
# then import the file for the entry point
|
||||
file_name, class_name = self.entry_point.split('::')
|
||||
sub_qualname = f'{qualname}.{file_name}'
|
||||
file_name, class_name = self.entry_point.split("::")
|
||||
sub_qualname = f"{qualname}.{file_name}"
|
||||
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
__import__(f"{qualname}.{file_name}", fromlist=[self.entry_point])
|
||||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
|
||||
@@ -257,11 +262,11 @@ class LazyBaseModule:
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v["default"]) for k, v in self.configs.items() if "default" in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
@@ -270,4 +275,4 @@ class LazyBaseModule:
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
return f"Module<'{self.display_name}' ({self.name})>"
|
||||
return f"Module<'{self.display_name}' ({self.name})>"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
"""Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
@@ -19,8 +19,17 @@ import requests
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .config import (
|
||||
read_yaml,
|
||||
store_yaml,
|
||||
to_dot_notation,
|
||||
merge_dicts,
|
||||
is_valid_config,
|
||||
DefaultValidatingParser,
|
||||
UniqueAppendAction,
|
||||
AuthenticationJsonParseAction,
|
||||
DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES
|
||||
@@ -30,10 +39,12 @@ if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
@@ -63,30 +74,63 @@ class ArchivingOrchestrator:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
action="store",
|
||||
dest="config_file",
|
||||
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||
default=DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
action="store",
|
||||
dest="mode",
|
||||
type=str,
|
||||
choices=["simple", "full"],
|
||||
help="the mode to run the archiver in",
|
||||
default="simple",
|
||||
)
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--store",
|
||||
dest="store",
|
||||
default=False,
|
||||
help="Store the created config in the config file",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--module_paths",
|
||||
dest="module_paths",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="additional paths to search for modules",
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
if not config["steps"].get(f"{module_type}s", []):
|
||||
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||
raise SetupError(
|
||||
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||
)
|
||||
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||
raise SetupError(
|
||||
f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||
)
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -94,7 +138,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||
"steps"
|
||||
].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
@@ -117,30 +163,32 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
avail_modules = self.module_factory.available_modules(
|
||||
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||
)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
elif basic_config.mode == "simple":
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
@@ -166,43 +214,76 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
f"--{module_type}s",
|
||||
dest=f"{module_type}s",
|
||||
nargs="+",
|
||||
help=f"the {module_type}s to use",
|
||||
default=[],
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
parser.add_argument(
|
||||
"--authentication",
|
||||
dest="authentication",
|
||||
help="A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction,
|
||||
)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
parser.add_argument(
|
||||
"--logging.level",
|
||||
action="store",
|
||||
dest="logging.level",
|
||||
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||
help="the logging level to use",
|
||||
default="INFO",
|
||||
type=str.upper,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.rotation",
|
||||
action="store",
|
||||
dest="logging.rotation",
|
||||
help="the logging rotation to use",
|
||||
default=None,
|
||||
)
|
||||
|
||||
def add_individual_module_args(
|
||||
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||
) -> None:
|
||||
if not modules:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
|
||||
for module in modules:
|
||||
if module.name == 'cli_feeder':
|
||||
if module.name == "cli_feeder":
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
parser.add_argument(
|
||||
"urls",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
)
|
||||
continue
|
||||
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -211,21 +292,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
if not kwargs.get('metavar', None):
|
||||
if not kwargs.get("metavar", None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
kwargs["metavar"] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
if kwargs.get("required", False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
kwargs.pop("default", None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
kwargs.pop("cli_set", None)
|
||||
should_store = kwargs.pop("should_store", False)
|
||||
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||
except AttributeError:
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
@@ -240,12 +321,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
logging_config = config["logging"]
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
if logging_config.get("enabled", True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
logger.disable("auto_archiver")
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
@@ -255,38 +335,45 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
pass
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||
if log_file := logging_config["file"]:
|
||||
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||
log_file, rotation=logging_config["rotation"]
|
||||
)
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
logger.error(
|
||||
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||
)
|
||||
raise SetupError(
|
||||
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||
)
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||
raise SetupError(
|
||||
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||
)
|
||||
|
||||
for module in modules_to_load:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
@@ -295,7 +382,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == 'extractor':
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
@@ -310,11 +397,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
logger.error(
|
||||
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||
)
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
@@ -337,13 +426,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response['info']['version']
|
||||
latest_version = response["info"]["version"]
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
@@ -353,33 +442,36 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
logger.warning(
|
||||
"The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
multiple times to archive multiple URLs."
|
||||
)
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
self.install_modules(self.config["steps"])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
logger.info(
|
||||
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||
)
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
@@ -387,9 +479,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
@@ -407,7 +499,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
@@ -438,7 +529,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
for d in self.databases:
|
||||
if type(e) == AssertionError:
|
||||
d.failed(item, str(e))
|
||||
@@ -453,13 +544,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
@@ -475,7 +566,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
url = a.sanitize_url(url)
|
||||
|
||||
result.set_url(url)
|
||||
if original_url != url: result.set("original_url", original_url)
|
||||
if original_url != url:
|
||||
result.set("original_url", original_url)
|
||||
|
||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||
cached_result = None
|
||||
@@ -486,7 +578,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
if cached_result:
|
||||
logger.debug("Found previously archived entry")
|
||||
for d in self.databases:
|
||||
try: d.done(cached_result, cached=True)
|
||||
try:
|
||||
d.done(cached_result, cached=True)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
return cached_result
|
||||
@@ -496,13 +589,15 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.info(f"Trying extractor {a.name} for {url}")
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
if result.is_success():
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
try:
|
||||
e.enrich(result)
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
@@ -520,12 +615,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
# signal completion to databases and archivers
|
||||
for d in self.databases:
|
||||
try: d.done(result)
|
||||
try:
|
||||
d.done(result)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
@@ -534,7 +629,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
authentication = config.get("authentication", {})
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
@@ -543,8 +638,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
|
||||
config["authentication"] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@@ -15,16 +15,16 @@ from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
self.set_key(media, url, metadata)
|
||||
@@ -46,14 +46,15 @@ class Storage(BaseModule):
|
||||
pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
with open(media.filename, "rb") as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = metadata.get_context('folder', '')
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
return
|
||||
folder = metadata.get_context("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
|
||||
@@ -3,11 +3,13 @@ from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
||||
return value
|
||||
|
||||
|
||||
def positive_number(value):
|
||||
if value < 0:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
||||
@@ -19,5 +21,6 @@ def valid_file(value):
|
||||
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
||||
return value
|
||||
|
||||
|
||||
def json_loader(cli_val):
|
||||
return json.loads(cli_val)
|
||||
return json.loads(cli_val)
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .api_db import AAApiDb
|
||||
from .api_db import AAApiDb
|
||||
|
||||
@@ -11,8 +11,7 @@
|
||||
"required": True,
|
||||
"help": "API endpoint where calls are made to",
|
||||
},
|
||||
"api_token": {"default": None,
|
||||
"help": "API Bearer token."},
|
||||
"api_token": {"default": None, "help": "API Bearer token."},
|
||||
"public": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
|
||||
@@ -12,10 +12,11 @@ class AAApiDb(Database):
|
||||
"""Connects to auto-archiver-api instance"""
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""
|
||||
if not self.use_api_cache: return
|
||||
if not self.use_api_cache:
|
||||
return
|
||||
|
||||
params = {"url": item.get_url(), "limit": 15}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
||||
@@ -32,22 +33,25 @@ class AAApiDb(Database):
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
if not self.store_results: return
|
||||
if not self.store_results:
|
||||
return
|
||||
if cached:
|
||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||
return
|
||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {
|
||||
'author_id': self.author_id,
|
||||
'url': item.get_url(),
|
||||
'public': self.public,
|
||||
'group_id': self.group_id,
|
||||
'tags': list(self.tags),
|
||||
'result': item.to_json(),
|
||||
"author_id": self.author_id,
|
||||
"url": item.get_url(),
|
||||
"public": self.public,
|
||||
"group_id": self.group_id,
|
||||
"tags": list(self.tags),
|
||||
"result": item.to_json(),
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
|
||||
response = requests.post(
|
||||
os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers
|
||||
)
|
||||
|
||||
if response.status_code == 201:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .atlos_feeder_db_storage import AtlosFeederDbStorage
|
||||
from .atlos_feeder_db_storage import AtlosFeederDbStorage
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Atlos Feeder Database Storage",
|
||||
"type": ["feeder", "database", "storage"],
|
||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
@@ -15,7 +15,7 @@
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
"type": "str",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
@@ -42,5 +42,5 @@
|
||||
- Requires an Atlos account with a project and a valid API token for authentication.
|
||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||
- Feches any media items within an Atlos project, regardless of separation into incidents.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ from auto_archiver.utils import calculate_file_hash
|
||||
|
||||
|
||||
class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
|
||||
def setup(self) -> requests.Session:
|
||||
"""create and return a persistent session."""
|
||||
self.session = requests.Session()
|
||||
@@ -18,9 +17,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
def _get(self, endpoint: str, params: Optional[dict] = None) -> dict:
|
||||
"""Wrapper for GET requests to the Atlos API."""
|
||||
url = f"{self.atlos_url}{endpoint}"
|
||||
response = self.session.get(
|
||||
url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params
|
||||
)
|
||||
response = self.session.get(url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
@@ -85,10 +82,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
def _process_metadata(self, item: Metadata) -> dict:
|
||||
"""Process metadata for storage on Atlos. Will convert any datetime
|
||||
objects to ISO format."""
|
||||
return {
|
||||
k: v.isoformat() if hasattr(v, "isoformat") else v
|
||||
for k, v in item.metadata.items()
|
||||
}
|
||||
return {k: v.isoformat() if hasattr(v, "isoformat") else v for k, v in item.metadata.items()}
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""Mark an item as successfully archived in Atlos."""
|
||||
@@ -129,10 +123,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
|
||||
# Check whether the media has already been uploaded
|
||||
source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"]
|
||||
existing_media = [
|
||||
artifact.get("file_hash_sha256")
|
||||
for artifact in source_material.get("artifacts", [])
|
||||
]
|
||||
existing_media = [artifact.get("file_hash_sha256") for artifact in source_material.get("artifacts", [])]
|
||||
if media_hash in existing_media:
|
||||
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
||||
return True
|
||||
@@ -150,4 +141,3 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""Upload a file-like object; not implemented."""
|
||||
pass
|
||||
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
{
|
||||
'name': 'Command Line Feeder',
|
||||
'type': ['feeder'],
|
||||
'entry_point': 'cli_feeder::CLIFeeder',
|
||||
'requires_setup': False,
|
||||
'description': 'Feeds URLs to orchestrator from the command line',
|
||||
'configs': {
|
||||
'urls': {
|
||||
'default': None,
|
||||
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
|
||||
"name": "Command Line Feeder",
|
||||
"type": ["feeder"],
|
||||
"entry_point": "cli_feeder::CLIFeeder",
|
||||
"requires_setup": False,
|
||||
"description": "Feeds URLs to orchestrator from the command line",
|
||||
"configs": {
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
},
|
||||
},
|
||||
'description': """
|
||||
"description": """
|
||||
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
||||
without the need to specify any additional configuration or command line arguments:
|
||||
|
||||
@@ -20,4 +20,4 @@ You can pass multiple URLs by separating them with a space. The URLs will be pro
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
|
||||
""",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,19 +3,21 @@ from loguru import logger
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
def setup(self) -> None:
|
||||
self.urls = self.config['urls']
|
||||
self.urls = self.config["urls"]
|
||||
if not self.urls:
|
||||
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
raise ValueError(
|
||||
"No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information."
|
||||
)
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
urls = self.config['urls']
|
||||
urls = self.config["urls"]
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
m.set_context("folder", "cli")
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .console_db import ConsoleDb
|
||||
from .console_db import ConsoleDb
|
||||
|
||||
@@ -6,18 +6,18 @@ from auto_archiver.core import Metadata
|
||||
|
||||
class ConsoleDb(Database):
|
||||
"""
|
||||
Outputs results to the console
|
||||
Outputs results to the console
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.info(f"STARTED {item}")
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}: {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item}")
|
||||
logger.success(f"DONE {item}")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .csv_db import CSVDb
|
||||
from .csv_db import CSVDb
|
||||
|
||||
@@ -2,12 +2,11 @@
|
||||
"name": "CSV Database",
|
||||
"type": ["database"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {"python": ["loguru"]
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"dependencies": {"python": ["loguru"]},
|
||||
"entry_point": "csv_db::CSVDb",
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"description": """
|
||||
Handles exporting archival results to a CSV file.
|
||||
|
||||
|
||||
@@ -9,14 +9,15 @@ from auto_archiver.core import Metadata
|
||||
|
||||
class CSVDb(Database):
|
||||
"""
|
||||
Outputs results to a CSV file
|
||||
Outputs results to a CSV file
|
||||
"""
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item}")
|
||||
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
|
||||
with open(self.csv_file, "a", encoding="utf-8") as outf:
|
||||
writer = DictWriter(outf, fieldnames=asdict(Metadata()))
|
||||
if is_empty: writer.writeheader()
|
||||
if is_empty:
|
||||
writer.writeheader()
|
||||
writer.writerow(asdict(item))
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .csv_feeder import CSVFeeder
|
||||
from .csv_feeder import CSVFeeder
|
||||
|
||||
@@ -2,26 +2,23 @@
|
||||
"name": "CSV Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": [""]
|
||||
},
|
||||
'requires_setup': True,
|
||||
'entry_point': "csv_feeder::CSVFeeder",
|
||||
"dependencies": {"python": ["loguru"], "bin": [""]},
|
||||
"requires_setup": True,
|
||||
"entry_point": "csv_feeder::CSVFeeder",
|
||||
"configs": {
|
||||
"files": {
|
||||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
"files": {
|
||||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
Input files should be formatted with one URL per line",
|
||||
"required": True,
|
||||
"type": "valid_file",
|
||||
"nargs": "+",
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
"help": "Column number or name to read the URLs from, 0-indexed",
|
||||
}
|
||||
"required": True,
|
||||
"type": "valid_file",
|
||||
"nargs": "+",
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
"help": "Column number or name to read the URLs from, 0-indexed",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Reads URLs from CSV files and feeds them into the archiving process.
|
||||
|
||||
@@ -33,5 +30,5 @@
|
||||
### Setup
|
||||
- Input files should be formatted with one URL per line, with or without a header row.
|
||||
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -5,11 +5,10 @@ from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import url_or_none
|
||||
|
||||
|
||||
class CSVFeeder(Feeder):
|
||||
|
||||
column = None
|
||||
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
for file in self.files:
|
||||
with open(file, "r") as f:
|
||||
@@ -20,9 +19,11 @@ class CSVFeeder(Feeder):
|
||||
try:
|
||||
url_column = first_row.index(url_column)
|
||||
except ValueError:
|
||||
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
|
||||
logger.error(
|
||||
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||
)
|
||||
return
|
||||
elif not(url_or_none(first_row[url_column])):
|
||||
elif not (url_or_none(first_row[url_column])):
|
||||
# it's a header row, but we've been given a column number already
|
||||
logger.debug(f"Skipping header row: {first_row}")
|
||||
else:
|
||||
@@ -35,4 +36,4 @@ class CSVFeeder(Feeder):
|
||||
continue
|
||||
url = row[url_column]
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url)
|
||||
yield Metadata().set_url(url)
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .gdrive_storage import GDriveStorage
|
||||
from .gdrive_storage import GDriveStorage
|
||||
|
||||
@@ -22,11 +22,18 @@
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"root_folder_id": {"required": True,
|
||||
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None,
|
||||
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
|
||||
"root_folder_id": {
|
||||
"required": True,
|
||||
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'",
|
||||
},
|
||||
"oauth_token": {
|
||||
"default": None,
|
||||
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account.",
|
||||
},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -94,5 +101,5 @@ This module integrates Google Drive as a storage backend, enabling automatic fol
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
|
||||
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
@@ -15,12 +14,9 @@ from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
|
||||
|
||||
class GDriveStorage(Storage):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.scopes = ['https://www.googleapis.com/auth/drive']
|
||||
self.scopes = ["https://www.googleapis.com/auth/drive"]
|
||||
# Initialize Google Drive service
|
||||
self._setup_google_drive_service()
|
||||
|
||||
@@ -37,25 +33,25 @@ class GDriveStorage(Storage):
|
||||
|
||||
def _initialize_with_oauth_token(self):
|
||||
"""Initialize Google Drive service with OAuth token."""
|
||||
with open(self.oauth_token, 'r') as stream:
|
||||
with open(self.oauth_token, "r") as stream:
|
||||
creds_json = json.load(stream)
|
||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
||||
creds_json["refresh_token"] = creds_json.get("refresh_token", "")
|
||||
|
||||
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
|
||||
if not creds.valid and creds.expired and creds.refresh_token:
|
||||
creds.refresh(Request())
|
||||
with open(self.oauth_token, 'w') as token_file:
|
||||
with open(self.oauth_token, "w") as token_file:
|
||||
logger.debug("Saving refreshed OAuth token.")
|
||||
token_file.write(creds.to_json())
|
||||
elif not creds.valid:
|
||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||
|
||||
return build('drive', 'v3', credentials=creds)
|
||||
return build("drive", "v3", credentials=creds)
|
||||
|
||||
def _initialize_with_service_account(self):
|
||||
"""Initialize Google Drive service with service account."""
|
||||
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
|
||||
return build('drive', 'v3', credentials=creds)
|
||||
return build("drive", "v3", credentials=creds)
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
"""
|
||||
@@ -79,7 +75,7 @@ class GDriveStorage(Storage):
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
"""
|
||||
1. for each sub-folder in the path check if exists or create
|
||||
2. upload file to root_id/other_paths.../filename
|
||||
@@ -95,25 +91,30 @@ class GDriveStorage(Storage):
|
||||
parent_id = upload_to
|
||||
|
||||
# upload file to gd
|
||||
logger.debug(f'uploading {filename=} to folder id {upload_to}')
|
||||
file_metadata = {
|
||||
'name': [filename],
|
||||
'parents': [upload_to]
|
||||
}
|
||||
logger.debug(f"uploading {filename=} to folder id {upload_to}")
|
||||
file_metadata = {"name": [filename], "parents": [upload_to]}
|
||||
media = MediaFileUpload(media.filename, resumable=True)
|
||||
gd_file = self.service.files().create(supportsAllDrives=True, body=file_metadata, media_body=media, fields='id').execute()
|
||||
logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
|
||||
gd_file = (
|
||||
self.service.files()
|
||||
.create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id")
|
||||
.execute()
|
||||
)
|
||||
logger.debug(f"uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}")
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
|
||||
def _get_id_from_parent_and_name(self, parent_id: str,
|
||||
name: str,
|
||||
retries: int = 1,
|
||||
sleep_seconds: int = 10,
|
||||
use_mime_type: bool = False,
|
||||
raise_on_missing: bool = True,
|
||||
use_cache=False):
|
||||
def _get_id_from_parent_and_name(
|
||||
self,
|
||||
parent_id: str,
|
||||
name: str,
|
||||
retries: int = 1,
|
||||
sleep_seconds: int = 10,
|
||||
use_mime_type: bool = False,
|
||||
raise_on_missing: bool = True,
|
||||
use_cache=False,
|
||||
):
|
||||
"""
|
||||
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
||||
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
||||
@@ -137,29 +138,36 @@ class GDriveStorage(Storage):
|
||||
query_string += f" and mimeType='application/vnd.google-apps.folder' "
|
||||
|
||||
for attempt in range(retries):
|
||||
results = self.service.files().list(
|
||||
# both below for Google Shared Drives
|
||||
supportsAllDrives=True,
|
||||
includeItemsFromAllDrives=True,
|
||||
q=query_string,
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
results = (
|
||||
self.service.files()
|
||||
.list(
|
||||
# both below for Google Shared Drives
|
||||
supportsAllDrives=True,
|
||||
includeItemsFromAllDrives=True,
|
||||
q=query_string,
|
||||
spaces="drive", # ie not appDataFolder or photos
|
||||
fields="files(id, name)",
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
items = results.get("files", [])
|
||||
|
||||
if len(items) > 0:
|
||||
logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
|
||||
_id = items[-1]['id']
|
||||
if use_cache: self.api_cache[cache_key] = _id
|
||||
logger.debug(
|
||||
f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}"
|
||||
)
|
||||
_id = items[-1]["id"]
|
||||
if use_cache:
|
||||
self.api_cache[cache_key] = _id
|
||||
return _id
|
||||
else:
|
||||
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
|
||||
logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.")
|
||||
if attempt < retries - 1:
|
||||
logger.debug(f'sleeping for {sleep_seconds} second(s)')
|
||||
logger.debug(f"sleeping for {sleep_seconds} second(s)")
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
if raise_on_missing:
|
||||
raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
|
||||
raise ValueError(f"{debug_header} not found after {retries} attempt(s)")
|
||||
return None
|
||||
|
||||
def _mkdir(self, name: str, parent_id: str):
|
||||
@@ -167,12 +175,7 @@ class GDriveStorage(Storage):
|
||||
Creates a new GDrive folder @name inside folder @parent_id
|
||||
Returns id of the created folder
|
||||
"""
|
||||
logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
|
||||
file_metadata = {
|
||||
'name': [name],
|
||||
'mimeType': 'application/vnd.google-apps.folder',
|
||||
'parents': [parent_id]
|
||||
}
|
||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
|
||||
return gd_folder.get('id')
|
||||
|
||||
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
|
||||
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
||||
return gd_folder.get("id")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .generic_extractor import GenericExtractor
|
||||
from .generic_extractor import GenericExtractor
|
||||
|
||||
@@ -4,15 +4,16 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
class Bluesky(GenericDropin):
|
||||
|
||||
class Bluesky(GenericDropin):
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
result = Metadata()
|
||||
result.set_url(url)
|
||||
result.set_title(post["record"]["text"])
|
||||
result.set_timestamp(post["record"]["createdAt"])
|
||||
for k, v in self._get_post_data(post).items():
|
||||
if v: result.set(k, v)
|
||||
if v:
|
||||
result.set(k, v)
|
||||
|
||||
# download if embeds present (1 video XOR >=1 images)
|
||||
for media in self._download_bsky_embeds(post, archiver):
|
||||
@@ -23,12 +24,12 @@ class Bluesky(GenericDropin):
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
|
||||
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
|
||||
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
|
||||
handle, video_id = ie_instance._match_valid_url(url).group("handle", "id")
|
||||
return ie_instance._extract_post(handle=handle, post_id=video_id)
|
||||
|
||||
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
||||
"""
|
||||
Iterates over image(s) or video in a Bluesky post and downloads them
|
||||
Iterates over image(s) or video in a Bluesky post and downloads them
|
||||
"""
|
||||
media = []
|
||||
embed = post.get("record", {}).get("embed", {})
|
||||
@@ -37,16 +38,15 @@ class Bluesky(GenericDropin):
|
||||
|
||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
|
||||
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(Media(image_media))
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
|
||||
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(Media(video_media))
|
||||
return media
|
||||
|
||||
|
||||
def _get_post_data(self, post: dict) -> dict:
|
||||
"""
|
||||
Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
|
||||
@@ -74,4 +74,4 @@ class Bluesky(GenericDropin):
|
||||
res["tags"] = tags
|
||||
if links:
|
||||
res["links"] = links
|
||||
return res
|
||||
return res
|
||||
|
||||
@@ -2,11 +2,12 @@ from yt_dlp.extractor.common import InfoExtractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
||||
|
||||
class GenericDropin:
|
||||
"""Base class for dropins for the generic extractor.
|
||||
|
||||
|
||||
In many instances, an extractor will exist in ytdlp, but it will only process videos.
|
||||
Dropins can be created and used to make use of the already-written private code of a
|
||||
Dropins can be created and used to make use of the already-written private code of a
|
||||
specific extractor from ytdlp.
|
||||
|
||||
The dropin should be able to handle the following methods:
|
||||
@@ -28,21 +29,19 @@ class GenericDropin:
|
||||
This method should return the post data from the url.
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
This method should create a Metadata object from the post data.
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
|
||||
def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
|
||||
"""
|
||||
This method should return True if you want to skip the ytdlp download method.
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor):
|
||||
"""
|
||||
This method should return a list of strings (keys) to clean from the video_data dict.
|
||||
@@ -50,9 +49,9 @@ class GenericDropin:
|
||||
E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
|
||||
"""
|
||||
return []
|
||||
|
||||
|
||||
def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata):
|
||||
"""
|
||||
This method should download any additional media from the post.
|
||||
"""
|
||||
return metadata
|
||||
return metadata
|
||||
|
||||
@@ -3,16 +3,15 @@ from .dropin import GenericDropin
|
||||
|
||||
class Facebook(GenericDropin):
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
video_id = ie_instance._match_valid_url(url).group('id')
|
||||
ie_instance._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
video_id = ie_instance._match_valid_url(url).group("id")
|
||||
ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group("id"))
|
||||
|
||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||
post_data = ie_instance._extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
metadata = archiver.create_metadata(url)
|
||||
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
|
||||
return metadata
|
||||
metadata.set_title(post.get("title")).set_content(post.get("description")).set_post_data(post)
|
||||
return metadata
|
||||
|
||||
@@ -12,6 +12,7 @@ from loguru import logger
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
@@ -19,14 +20,14 @@ class GenericExtractor(Extractor):
|
||||
# check for file .ytdlp-update in the secrets folder
|
||||
if self.ytdlp_update_interval < 0:
|
||||
return
|
||||
|
||||
use_secrets = os.path.exists('secrets')
|
||||
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
|
||||
|
||||
use_secrets = os.path.exists("secrets")
|
||||
path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
|
||||
next_update_check = None
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as f:
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
|
||||
@@ -36,8 +37,11 @@ class GenericExtractor(Extractor):
|
||||
|
||||
def update_ytdlp(self):
|
||||
logger.info("Checking and updating yt-dlp...")
|
||||
logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
|
||||
logger.info(
|
||||
f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
|
||||
)
|
||||
from importlib.metadata import version as get_version
|
||||
|
||||
old_version = get_version("yt-dlp")
|
||||
try:
|
||||
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
||||
@@ -59,15 +63,17 @@ class GenericExtractor(Extractor):
|
||||
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
|
||||
if info_extractor.suitable(url) and info_extractor.working():
|
||||
yield info_extractor
|
||||
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
Checks for valid URLs out of all ytdlp extractors.
|
||||
Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
|
||||
"""
|
||||
return any(self.suitable_extractors(url))
|
||||
|
||||
def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
|
||||
|
||||
def download_additional_media(
|
||||
self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata
|
||||
) -> Metadata:
|
||||
"""
|
||||
Downloads additional media like images, comments, subtitles, etc.
|
||||
|
||||
@@ -76,7 +82,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# Just get the main thumbnail. More thumbnails are available in
|
||||
# video_data['thumbnails'] should they be required
|
||||
thumbnail_url = video_data.get('thumbnail')
|
||||
thumbnail_url = video_data.get("thumbnail")
|
||||
if thumbnail_url:
|
||||
try:
|
||||
cover_image_path = self.download_from_url(thumbnail_url)
|
||||
@@ -99,15 +105,65 @@ class GenericExtractor(Extractor):
|
||||
Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
|
||||
"""
|
||||
|
||||
base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
|
||||
'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
|
||||
'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
|
||||
'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
|
||||
'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
|
||||
'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
|
||||
'_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
|
||||
'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
|
||||
|
||||
base_keys = [
|
||||
"formats",
|
||||
"thumbnail",
|
||||
"display_id",
|
||||
"epoch",
|
||||
"requested_downloads",
|
||||
"duration_string",
|
||||
"thumbnails",
|
||||
"http_headers",
|
||||
"webpage_url_basename",
|
||||
"webpage_url_domain",
|
||||
"extractor",
|
||||
"extractor_key",
|
||||
"playlist",
|
||||
"playlist_index",
|
||||
"duration_string",
|
||||
"protocol",
|
||||
"requested_subtitles",
|
||||
"format_id",
|
||||
"acodec",
|
||||
"vcodec",
|
||||
"ext",
|
||||
"epoch",
|
||||
"_has_drm",
|
||||
"filesize",
|
||||
"audio_ext",
|
||||
"video_ext",
|
||||
"vbr",
|
||||
"abr",
|
||||
"resolution",
|
||||
"dynamic_range",
|
||||
"aspect_ratio",
|
||||
"cookies",
|
||||
"format",
|
||||
"quality",
|
||||
"preference",
|
||||
"artists",
|
||||
"channel_id",
|
||||
"subtitles",
|
||||
"tbr",
|
||||
"url",
|
||||
"original_url",
|
||||
"automatic_captions",
|
||||
"playable_in_embed",
|
||||
"live_status",
|
||||
"_format_sort_fields",
|
||||
"chapters",
|
||||
"requested_formats",
|
||||
"format_note",
|
||||
"audio_channels",
|
||||
"asr",
|
||||
"fps",
|
||||
"was_live",
|
||||
"is_live",
|
||||
"heatmap",
|
||||
"age_limit",
|
||||
"stretched_ratio",
|
||||
]
|
||||
|
||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||
if dropin:
|
||||
try:
|
||||
@@ -116,8 +172,8 @@ class GenericExtractor(Extractor):
|
||||
pass
|
||||
|
||||
return base_keys
|
||||
|
||||
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
|
||||
|
||||
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str, result: Metadata) -> Metadata:
|
||||
"""
|
||||
Creates a Metadata object from the given video_data
|
||||
"""
|
||||
@@ -126,29 +182,36 @@ class GenericExtractor(Extractor):
|
||||
result = self.download_additional_media(video_data, info_extractor, result)
|
||||
|
||||
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
|
||||
result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
|
||||
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
|
||||
result.set_url(url)
|
||||
if "description" in video_data: result.set_content(video_data["description"])
|
||||
if "description" in video_data:
|
||||
result.set_content(video_data["description"])
|
||||
# extract comments if enabled
|
||||
if self.comments:
|
||||
result.set("comments", [{
|
||||
"text": c["text"],
|
||||
"author": c["author"],
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
|
||||
} for c in video_data.get("comments", [])])
|
||||
result.set(
|
||||
"comments",
|
||||
[
|
||||
{
|
||||
"text": c["text"],
|
||||
"author": c["author"],
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz=datetime.timezone.utc),
|
||||
}
|
||||
for c in video_data.get("comments", [])
|
||||
],
|
||||
)
|
||||
|
||||
# then add the common metadata
|
||||
if timestamp := video_data.pop("timestamp", None):
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
|
||||
result.set_timestamp(timestamp)
|
||||
if upload_date := video_data.pop("upload_date", None):
|
||||
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
|
||||
upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
|
||||
result.set("upload_date", upload_date)
|
||||
|
||||
|
||||
# then clean away any keys we don't want
|
||||
for clean_key in self.keys_to_clean(info_extractor, video_data):
|
||||
video_data.pop(clean_key, None)
|
||||
|
||||
|
||||
# then add the rest of the video data
|
||||
for k, v in video_data.items():
|
||||
if v:
|
||||
@@ -169,22 +232,24 @@ class GenericExtractor(Extractor):
|
||||
logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
|
||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||
return False
|
||||
|
||||
|
||||
post_data = dropin.extract_post(url, ie_instance)
|
||||
return dropin.create_metadata(post_data, ie_instance, self, url)
|
||||
|
||||
def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
|
||||
def get_metadata_for_video(
|
||||
self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
|
||||
) -> Metadata:
|
||||
# this time download
|
||||
ydl.params['getcomments'] = self.comments
|
||||
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
ydl.params["getcomments"] = self.comments
|
||||
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||
if "entries" in data:
|
||||
entries = data.get("entries", [])
|
||||
if not len(entries):
|
||||
logger.warning('YoutubeDLArchiver could not find any video')
|
||||
logger.warning("YoutubeDLArchiver could not find any video")
|
||||
return False
|
||||
else: entries = [data]
|
||||
else:
|
||||
entries = [data]
|
||||
|
||||
result = Metadata()
|
||||
|
||||
@@ -192,17 +257,18 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
filename = ydl.prepare_filename(entry)
|
||||
if not os.path.exists(filename):
|
||||
filename = filename.split('.')[0] + '.mkv'
|
||||
filename = filename.split(".")[0] + ".mkv"
|
||||
|
||||
new_media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
if x in entry: new_media.set(x, entry[x])
|
||||
if x in entry:
|
||||
new_media.set(x, entry[x])
|
||||
|
||||
# read text from subtitles if enabled
|
||||
if self.subtitles:
|
||||
for lang, val in (data.get('requested_subtitles') or {}).items():
|
||||
try:
|
||||
subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
|
||||
for lang, val in (data.get("requested_subtitles") or {}).items():
|
||||
try:
|
||||
subs = pysubs2.load(val.get("filepath"), encoding="utf-8")
|
||||
text = " ".join([line.text for line in subs])
|
||||
new_media.set(f"subtitles_{lang}", text)
|
||||
except Exception as e:
|
||||
@@ -212,8 +278,8 @@ class GenericExtractor(Extractor):
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
|
||||
def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
|
||||
|
||||
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
|
||||
dropin_name = dropin_name.lower()
|
||||
|
||||
if dropin_name == "generic":
|
||||
@@ -221,6 +287,7 @@ class GenericExtractor(Extractor):
|
||||
return None
|
||||
|
||||
dropin_class_name = dropin_name.title()
|
||||
|
||||
def _load_dropin(dropin):
|
||||
dropin_class = getattr(dropin, dropin_class_name)()
|
||||
return self._dropins.setdefault(dropin_name, dropin_class)
|
||||
@@ -244,7 +311,7 @@ class GenericExtractor(Extractor):
|
||||
return _load_dropin(dropin)
|
||||
except (FileNotFoundError, ModuleNotFoundError):
|
||||
pass
|
||||
|
||||
|
||||
# fallback to loading the dropins within auto-archiver
|
||||
try:
|
||||
return _load_dropin(importlib.import_module(f".{dropin_name}", package=package))
|
||||
@@ -256,12 +323,12 @@ class GenericExtractor(Extractor):
|
||||
def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
"""
|
||||
Tries to download the given url using the specified extractor
|
||||
|
||||
|
||||
It first tries to use ytdlp directly to download the video. If the post is not a video, it will then try to
|
||||
use the extractor's _extract_post method to get the post metadata if possible.
|
||||
"""
|
||||
# when getting info without download, we also don't need the comments
|
||||
ydl.params['getcomments'] = False
|
||||
ydl.params["getcomments"] = False
|
||||
result = False
|
||||
|
||||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||
@@ -272,7 +339,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
if data.get('is_live', False) and not self.livestreams:
|
||||
if data.get("is_live", False) and not self.livestreams:
|
||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
return False
|
||||
# it's a valid video, that the youtubdedl can download out of the box
|
||||
@@ -283,16 +350,21 @@ class GenericExtractor(Extractor):
|
||||
# don't clutter the logs with issues about the 'generic' extractor not having a dropin
|
||||
return False
|
||||
|
||||
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
|
||||
logger.debug(
|
||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
|
||||
)
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
logger.error(f'Error downloading metadata for post: {post_e}')
|
||||
logger.error(f"Error downloading metadata for post: {post_e}")
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True)
|
||||
logger.debug(
|
||||
f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}',
|
||||
exc_info=True,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
if result:
|
||||
extractor_name = "yt-dlp"
|
||||
if info_extractor:
|
||||
@@ -308,43 +380,49 @@ class GenericExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
# TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
if url.startswith("https://ya.ru"):
|
||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||
item.set("replaced_url", url)
|
||||
|
||||
ydl_options = {
|
||||
"outtmpl": os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"),
|
||||
"quiet": False,
|
||||
"noplaylist": not self.allow_playlist,
|
||||
"writesubtitles": self.subtitles,
|
||||
"writeautomaticsub": self.subtitles,
|
||||
"live_from_start": self.live_from_start,
|
||||
"proxy": self.proxy,
|
||||
"max_downloads": self.max_downloads,
|
||||
"playlistend": self.max_downloads,
|
||||
}
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
||||
'quiet': False, 'noplaylist': not self.allow_playlist ,
|
||||
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
|
||||
"live_from_start": self.live_from_start, "proxy": self.proxy,
|
||||
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
ydl_options['username'] = auth['username']
|
||||
ydl_options['password'] = auth['password']
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
||||
ydl_options['cookiefile'] = auth['cookies_file']
|
||||
if "username" in auth and "password" in auth:
|
||||
logger.debug(f"Using provided auth username and password for {url}")
|
||||
ydl_options["username"] = auth["username"]
|
||||
ydl_options["password"] = auth["password"]
|
||||
elif "cookie" in auth:
|
||||
logger.debug(f"Using provided auth cookie for {url}")
|
||||
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||
elif "cookies_from_browser" in auth:
|
||||
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
|
||||
ydl_options["cookiesfrombrowser"] = auth["cookies_from_browser"]
|
||||
elif "cookies_file" in auth:
|
||||
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
|
||||
ydl_options["cookiefile"] = auth["cookies_file"]
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
ydl_options
|
||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
for info_extractor in self.suitable_extractors(url):
|
||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if result:
|
||||
return result
|
||||
|
||||
|
||||
return False
|
||||
|
||||
@@ -9,11 +9,11 @@ from dateutil.parser import parse as parse_dt
|
||||
|
||||
from .dropin import GenericDropin
|
||||
|
||||
class Truth(GenericDropin):
|
||||
|
||||
class Truth(GenericDropin):
|
||||
def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
|
||||
video_id = ie_instance._match_id(url)
|
||||
truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
|
||||
truthsocial_url = f"https://truthsocial.com/api/v1/statuses/{video_id}"
|
||||
return ie_instance._download_json(truthsocial_url, video_id)
|
||||
|
||||
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
|
||||
@@ -22,31 +22,42 @@ class Truth(GenericDropin):
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
Creates metadata from a truth social post
|
||||
|
||||
|
||||
Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media
|
||||
|
||||
|
||||
Format is:
|
||||
|
||||
|
||||
{'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 'tv_onboarded': 
True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
|
||||
"""
|
||||
|
||||
result = Metadata()
|
||||
result.set_url(url)
|
||||
timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
|
||||
timestamp = post["created_at"] # format is 2022-12-29T19:51:18.161Z
|
||||
result.set_timestamp(parse_dt(timestamp))
|
||||
result.set('description', post['content'])
|
||||
result.set('author', post['account']['username'])
|
||||
result.set("description", post["content"])
|
||||
result.set("author", post["account"]["username"])
|
||||
|
||||
for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
|
||||
for key in [
|
||||
"replies_count",
|
||||
"reblogs_count",
|
||||
"favourites_count",
|
||||
("account", "followers_count"),
|
||||
("account", "following_count"),
|
||||
("account", "statuses_count"),
|
||||
("account", "display_name"),
|
||||
"language",
|
||||
"in_reply_to_account",
|
||||
"replies_count",
|
||||
]:
|
||||
if isinstance(key, tuple):
|
||||
store_key = " ".join(key)
|
||||
else:
|
||||
store_key = key
|
||||
result.set(store_key, traverse_obj(post, key))
|
||||
|
||||
# add the media
|
||||
for media in post.get('media_attachments', []):
|
||||
filename = archiver.download_from_url(media['url'])
|
||||
result.add_media(Media(filename), id=media.get('id'))
|
||||
|
||||
return result
|
||||
# add the media
|
||||
for media in post.get("media_attachments", []):
|
||||
filename = archiver.download_from_url(media["url"])
|
||||
result.add_media(Media(filename), id=media.get("id"))
|
||||
|
||||
return result
|
||||
|
||||
@@ -10,9 +10,8 @@ from auto_archiver.core.extractor import Extractor
|
||||
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
class Twitter(GenericDropin):
|
||||
|
||||
|
||||
def choose_variant(self, variants):
|
||||
# choosing the highest quality possible
|
||||
variant, width, height = None, 0, 0
|
||||
@@ -27,9 +26,9 @@ class Twitter(GenericDropin):
|
||||
else:
|
||||
variant = var if not variant else variant
|
||||
return variant
|
||||
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||
twid = ie_instance._match_valid_url(url).group('id')
|
||||
twid = ie_instance._match_valid_url(url).group("id")
|
||||
return ie_instance._extract_status(twid=twid)
|
||||
|
||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
@@ -41,30 +40,29 @@ class Twitter(GenericDropin):
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
return False
|
||||
|
||||
result\
|
||||
.set_title(tweet.get('full_text', ''))\
|
||||
.set_content(json.dumps(tweet, ensure_ascii=False))\
|
||||
.set_timestamp(timestamp)
|
||||
|
||||
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
|
||||
timestamp
|
||||
)
|
||||
if not tweet.get("entities", {}).get("media"):
|
||||
logger.debug('No media found, archiving tweet text only')
|
||||
logger.debug("No media found, archiving tweet text only")
|
||||
result.status = "twitter-ytdl"
|
||||
return result
|
||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||
media = Media(filename="")
|
||||
mimetype = ""
|
||||
if tw_media["type"] == "photo":
|
||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
|
||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media["media_url_https"]))
|
||||
mimetype = "image/jpeg"
|
||||
elif tw_media["type"] == "video":
|
||||
variant = self.choose_variant(tw_media['video_info']['variants'])
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
variant = self.choose_variant(tw_media["video_info"]["variants"])
|
||||
media.set("src", variant["url"])
|
||||
mimetype = variant["content_type"]
|
||||
elif tw_media["type"] == "animated_gif":
|
||||
variant = tw_media['video_info']['variants'][0]
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
variant = tw_media["video_info"]["variants"][0]
|
||||
media.set("src", variant["url"])
|
||||
mimetype = variant["content_type"]
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
||||
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
result.add_media(media)
|
||||
return result
|
||||
return result
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
|
||||
@@ -12,9 +12,7 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"header": {"default": 1, "type": "int", "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
@@ -65,7 +63,7 @@
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeederDatabase
|
||||
|
||||
@@ -8,6 +8,7 @@ The filtered rows are processed into `Metadata` objects.
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import quote
|
||||
@@ -23,7 +24,6 @@ from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
@@ -42,24 +42,28 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
logger.warning(
|
||||
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||
)
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
logger.success(f"Finished worksheet {worksheet.title}")
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
url = gw.get_cell(row, "url").strip()
|
||||
if not len(url):
|
||||
continue
|
||||
original_status = gw.get_cell(row, "status")
|
||||
status = gw.get_cell(row, "status", fresh=original_status in ["", None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
if status not in ["", None]:
|
||||
continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
@@ -70,10 +74,10 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
if gw.get_cell_or_default(row, "folder", "") is None:
|
||||
folder = ""
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
folder = slugify(gw.get_cell_or_default(row, "folder", "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
@@ -91,12 +95,11 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
for required_col in ['url', 'status']:
|
||||
for required_col in ["url", "status"]:
|
||||
if not gw.col_exists(required_col):
|
||||
missing.append(required_col)
|
||||
return missing
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
@@ -155,9 +158,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
@@ -186,11 +187,12 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
logger.error(
|
||||
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
|
||||
)
|
||||
|
||||
return gw, row
|
||||
|
||||
@@ -5,24 +5,25 @@ class GWorksheet:
|
||||
"""
|
||||
This class makes read/write operations to the a worksheet easier.
|
||||
It can read the headers from a custom row number, but the row references
|
||||
should always include the offset of the header.
|
||||
eg: if header=4, row 5 will be the first with data.
|
||||
should always include the offset of the header.
|
||||
eg: if header=4, row 5 will be the first with data.
|
||||
"""
|
||||
|
||||
COLUMN_NAMES = {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'pdq_hash': 'perceptual hashes',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
}
|
||||
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||
@@ -36,7 +37,7 @@ class GWorksheet:
|
||||
|
||||
def _check_col_exists(self, col: str):
|
||||
if col not in self.columns:
|
||||
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
||||
raise Exception(f"Column {col} is not in the configured column names: {self.columns.keys()}")
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
@@ -58,7 +59,7 @@ class GWorksheet:
|
||||
|
||||
def get_cell(self, row, col: str, fresh=False):
|
||||
"""
|
||||
returns the cell value from (row, col),
|
||||
returns the cell value from (row, col),
|
||||
where row can be an index (1-based) OR list of values
|
||||
as received from self.get_row(row)
|
||||
if fresh=True, the sheet is queried again for this cell
|
||||
@@ -71,7 +72,7 @@ class GWorksheet:
|
||||
row = self.get_row(row)
|
||||
|
||||
if col_index >= len(row):
|
||||
return ''
|
||||
return ""
|
||||
return row[col_index]
|
||||
|
||||
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
||||
@@ -96,13 +97,9 @@ class GWorksheet:
|
||||
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
||||
"""
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[str(val)[0:49999]]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
{"range": self.to_a1(row, col), "values": [[str(val)[0:49999]]]} for row, col, val in cell_updates
|
||||
]
|
||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||
self.wks.batch_update(cell_updates, value_input_option="USER_ENTERED")
|
||||
|
||||
def to_a1(self, row: int, col: str):
|
||||
# row is 1-based
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .hash_enricher import HashEnricher
|
||||
from .hash_enricher import HashEnricher
|
||||
|
||||
@@ -3,16 +3,17 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
# TODO add non-negative requirement to match previous implementation?
|
||||
"chunksize": {"default": 16000000,
|
||||
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
|
||||
'type': 'int',
|
||||
},
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
# TODO add non-negative requirement to match previous implementation?
|
||||
"chunksize": {
|
||||
"default": 16000000,
|
||||
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
|
||||
"type": "int",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
""" Hash Enricher for generating cryptographic hashes of media files.
|
||||
"""Hash Enricher for generating cryptographic hashes of media files.
|
||||
|
||||
The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
|
||||
for media files stored in `Metadata` objects. These hashes are used for
|
||||
@@ -7,6 +7,7 @@ exact duplicates. The hash is computed by reading the file's bytes in chunks,
|
||||
making it suitable for handling large files efficiently.
|
||||
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from loguru import logger
|
||||
|
||||
@@ -20,7 +21,6 @@ class HashEnricher(Enricher):
|
||||
Calculates hashes for Media instances
|
||||
"""
|
||||
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
@@ -35,5 +35,6 @@ class HashEnricher(Enricher):
|
||||
hash_algo = hashlib.sha256
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash_algo = hashlib.sha3_512
|
||||
else: return ""
|
||||
else:
|
||||
return ""
|
||||
return calculate_file_hash(filename, hash_algo, self.chunksize)
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .html_formatter import HtmlFormatter
|
||||
from .html_formatter import HtmlFormatter
|
||||
|
||||
@@ -2,14 +2,13 @@
|
||||
"name": "HTML Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["hash_enricher", "loguru", "jinja2"],
|
||||
"bin": [""]
|
||||
},
|
||||
"dependencies": {"python": ["hash_enricher", "loguru", "jinja2"], "bin": [""]},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool"},
|
||||
"detect_thumbnails": {
|
||||
"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool",
|
||||
},
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
template: any = None
|
||||
@@ -21,9 +22,9 @@ class HtmlFormatter(Formatter):
|
||||
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
||||
|
||||
# JinjaHelper class static methods are added as filters
|
||||
self.environment.filters.update({
|
||||
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
|
||||
})
|
||||
self.environment.filters.update(
|
||||
{k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)}
|
||||
)
|
||||
|
||||
# Load a specific template or default to "html_template.html"
|
||||
template_name = self.config.get("template_name", "html_template.html")
|
||||
@@ -36,11 +37,7 @@ class HtmlFormatter(Formatter):
|
||||
return
|
||||
|
||||
content = self.template.render(
|
||||
url=url,
|
||||
title=item.get_title(),
|
||||
media=item.media,
|
||||
metadata=item.metadata,
|
||||
version=__version__
|
||||
url=url, title=item.get_title(), media=item.media, metadata=item.metadata, version=__version__
|
||||
)
|
||||
|
||||
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
|
||||
@@ -49,7 +46,7 @@ class HtmlFormatter(Formatter):
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
# get the already instantiated hash_enricher module
|
||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -2,18 +2,18 @@
|
||||
"name": "Instagram API Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
|
||||
"dependencies":
|
||||
{"python": ["requests",
|
||||
"loguru",
|
||||
"retrying",
|
||||
"tqdm",],
|
||||
},
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"requests",
|
||||
"loguru",
|
||||
"retrying",
|
||||
"tqdm",
|
||||
],
|
||||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"access_token": {"default": None,
|
||||
"help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"required": True,
|
||||
"help": "API endpoint to use"},
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"required": True, "help": "API endpoint to use"},
|
||||
"full_profile": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
|
||||
@@ -36,21 +36,16 @@ class InstagramAPIExtractor(Extractor):
|
||||
if self.api_endpoint[-1] == "/":
|
||||
self.api_endpoint = self.api_endpoint[:-1]
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
url.replace("instagr.com", "instagram.com").replace(
|
||||
"instagr.am", "instagram.com"
|
||||
)
|
||||
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
|
||||
insta_matches = self.valid_url.findall(url)
|
||||
logger.info(f"{insta_matches=}")
|
||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||
return
|
||||
if len(insta_matches) > 1:
|
||||
logger.warning(
|
||||
f"Multiple instagram matches found in {url=}, using the first one"
|
||||
)
|
||||
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
|
||||
return
|
||||
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
||||
if g1 == "":
|
||||
@@ -73,9 +68,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
def call_api(self, path: str, params: dict) -> dict:
|
||||
headers = {"accept": "application/json", "x-access-key": self.access_token}
|
||||
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
|
||||
return requests.get(
|
||||
f"{self.api_endpoint}/{path}", headers=headers, params=params
|
||||
).json()
|
||||
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
|
||||
|
||||
def cleanup_dict(self, d: dict | list) -> dict:
|
||||
# repeats 3 times to remove nested empty values
|
||||
@@ -88,8 +81,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
return {
|
||||
k: clean_v
|
||||
for k, v in d.items()
|
||||
if (clean_v := self.cleanup_dict(v))
|
||||
not in [0.0, 0, [], {}, "", None, "null"]
|
||||
if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"]
|
||||
and k not in ["x", "y", "width", "height"]
|
||||
}
|
||||
|
||||
@@ -126,9 +118,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
try:
|
||||
self.download_all_tagged(result, user_id)
|
||||
except Exception as e:
|
||||
result.append(
|
||||
"errors", f"Error downloading tagged posts for {username}"
|
||||
)
|
||||
result.append("errors", f"Error downloading tagged posts for {username}")
|
||||
logger.error(f"Error downloading tagged posts for {username}: {e}")
|
||||
|
||||
# download all highlights
|
||||
@@ -153,22 +143,13 @@ class InstagramAPIExtractor(Extractor):
|
||||
"errors",
|
||||
f"Error downloading highlight id{h.get('pk')} for {username}",
|
||||
)
|
||||
logger.error(
|
||||
f"Error downloading highlight id{h.get('pk')} for {username}: {e}"
|
||||
)
|
||||
if (
|
||||
self.full_profile_max_posts
|
||||
and count_highlights >= self.full_profile_max_posts
|
||||
):
|
||||
logger.info(
|
||||
f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}"
|
||||
)
|
||||
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
|
||||
if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
|
||||
logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||
break
|
||||
result.set("#highlights", count_highlights)
|
||||
|
||||
def download_post(
|
||||
self, result: Metadata, code: str = None, id: str = None, context: str = None
|
||||
) -> Metadata:
|
||||
def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
|
||||
if id:
|
||||
post = self.call_api(f"v1/media/by/id", {"id": id})
|
||||
else:
|
||||
@@ -196,11 +177,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
|
||||
assert h_info, f"Highlight {id} not found: {full_h=}"
|
||||
|
||||
if (
|
||||
cover_media := h_info.get("cover_media", {})
|
||||
.get("cropped_image_version", {})
|
||||
.get("url")
|
||||
):
|
||||
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
||||
filename = self.download_from_url(cover_media)
|
||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||
|
||||
@@ -210,9 +187,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
self.scrape_item(result, h, "highlight")
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
||||
logger.error(
|
||||
f"Error downloading highlight, skipping {h.get('id')}: {e}"
|
||||
)
|
||||
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")
|
||||
|
||||
return h_info
|
||||
|
||||
@@ -244,9 +219,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
post_count = 0
|
||||
while end_cursor != "":
|
||||
posts = self.call_api(
|
||||
f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}
|
||||
)
|
||||
posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
|
||||
if not len(posts) or not type(posts) == list or len(posts) != 2:
|
||||
break
|
||||
posts, end_cursor = posts[0], posts[1]
|
||||
@@ -260,13 +233,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
|
||||
pbar.update(1)
|
||||
post_count += 1
|
||||
if (
|
||||
self.full_profile_max_posts
|
||||
and post_count >= self.full_profile_max_posts
|
||||
):
|
||||
logger.info(
|
||||
f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}"
|
||||
)
|
||||
if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
|
||||
logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||
break
|
||||
result.set("#posts", post_count)
|
||||
|
||||
@@ -276,9 +244,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
tagged_count = 0
|
||||
while next_page_id != None:
|
||||
resp = self.call_api(
|
||||
f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}
|
||||
)
|
||||
resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
|
||||
posts = resp.get("response", {}).get("items", [])
|
||||
if not len(posts):
|
||||
break
|
||||
@@ -290,21 +256,12 @@ class InstagramAPIExtractor(Extractor):
|
||||
try:
|
||||
self.scrape_item(result, p, "tagged")
|
||||
except Exception as e:
|
||||
result.append(
|
||||
"errors", f"Error downloading tagged post {p.get('id')}"
|
||||
)
|
||||
logger.error(
|
||||
f"Error downloading tagged post, skipping {p.get('id')}: {e}"
|
||||
)
|
||||
result.append("errors", f"Error downloading tagged post {p.get('id')}")
|
||||
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
|
||||
pbar.update(1)
|
||||
tagged_count += 1
|
||||
if (
|
||||
self.full_profile_max_posts
|
||||
and tagged_count >= self.full_profile_max_posts
|
||||
):
|
||||
logger.info(
|
||||
f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}"
|
||||
)
|
||||
if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
|
||||
logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
|
||||
break
|
||||
result.set("#tagged", tagged_count)
|
||||
|
||||
@@ -318,9 +275,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
context can be used to give specific id prefixes to media
|
||||
"""
|
||||
if "clips_metadata" in item:
|
||||
if reusable_text := item.get("clips_metadata", {}).get(
|
||||
"reusable_text_attribute_string"
|
||||
):
|
||||
if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
|
||||
item["clips_metadata_text"] = reusable_text
|
||||
if self.minimize_json_output:
|
||||
del item["clips_metadata"]
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .instagram_extractor import InstagramExtractor
|
||||
from .instagram_extractor import InstagramExtractor
|
||||
|
||||
@@ -9,8 +9,7 @@
|
||||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"required": True,
|
||||
"help": "A valid Instagram username."},
|
||||
"username": {"required": True, "help": "A valid Instagram username."},
|
||||
"password": {
|
||||
"required": True,
|
||||
"help": "The corresponding Instagram account password.",
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
""" Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
"""Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
|
||||
"""
|
||||
|
||||
import re, os, shutil
|
||||
import instaloader
|
||||
from loguru import logger
|
||||
@@ -11,6 +12,7 @@ from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import Media
|
||||
|
||||
|
||||
class InstagramExtractor(Extractor):
|
||||
"""
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
@@ -25,13 +27,12 @@ class InstagramExtractor(Extractor):
|
||||
# TODO: links to stories
|
||||
|
||||
def setup(self) -> None:
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True,
|
||||
download_comments=True,
|
||||
compress_json=False,
|
||||
dirname_pattern=self.download_folder,
|
||||
filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
filename_pattern="{date_utc}_UTC_{target}__{typename}",
|
||||
)
|
||||
try:
|
||||
self.insta.load_session_from_file(self.username, self.session_file)
|
||||
@@ -44,7 +45,6 @@ class InstagramExtractor(Extractor):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -53,7 +53,8 @@ class InstagramExtractor(Extractor):
|
||||
profile_matches = self.profile_pattern.findall(url)
|
||||
|
||||
# return if not a valid instagram link
|
||||
if not len(post_matches) and not len(profile_matches): return
|
||||
if not len(post_matches) and not len(profile_matches):
|
||||
return
|
||||
|
||||
result = None
|
||||
try:
|
||||
@@ -65,7 +66,9 @@ class InstagramExtractor(Extractor):
|
||||
elif len(profile_matches):
|
||||
result = self.download_profile(url, profile_matches[0])
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
|
||||
logger.error(
|
||||
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||
)
|
||||
finally:
|
||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||
return result
|
||||
@@ -84,35 +87,50 @@ class InstagramExtractor(Extractor):
|
||||
profile = instaloader.Profile.from_username(self.insta.context, username)
|
||||
try:
|
||||
for post in profile.get_posts():
|
||||
try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||
except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
||||
except Exception as e: logger.error(f"Failed profile.get_posts: {e}")
|
||||
try:
|
||||
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_posts: {e}")
|
||||
|
||||
try:
|
||||
for post in profile.get_tagged_posts():
|
||||
try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||
except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
||||
except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}")
|
||||
try:
|
||||
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_tagged_posts: {e}")
|
||||
|
||||
try:
|
||||
for post in profile.get_igtv_posts():
|
||||
try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||
except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
||||
except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}")
|
||||
try:
|
||||
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_igtv_posts: {e}")
|
||||
|
||||
try:
|
||||
for story in self.insta.get_stories([profile.userid]):
|
||||
for item in story.get_items():
|
||||
try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||
except Exception as e: logger.error(f"Failed to download story item: {item}: {e}")
|
||||
except Exception as e: logger.error(f"Failed get_stories: {e}")
|
||||
try:
|
||||
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download story item: {item}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed get_stories: {e}")
|
||||
|
||||
try:
|
||||
for highlight in self.insta.get_highlights(profile.userid):
|
||||
for item in highlight.get_items():
|
||||
try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||
except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}")
|
||||
except Exception as e: logger.error(f"Failed get_highlights: {e}")
|
||||
try:
|
||||
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download highlight item: {item}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed get_highlights: {e}")
|
||||
|
||||
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
||||
|
||||
@@ -124,7 +142,8 @@ class InstagramExtractor(Extractor):
|
||||
all_media = []
|
||||
for f in os.listdir(self.download_folder):
|
||||
if os.path.isfile((filename := os.path.join(self.download_folder, f))):
|
||||
if filename[-4:] == ".txt": continue
|
||||
if filename[-4:] == ".txt":
|
||||
continue
|
||||
all_media.append(Media(filename))
|
||||
|
||||
assert len(all_media) > 1, "No uploaded media found"
|
||||
|
||||
@@ -1,16 +1,21 @@
|
||||
{
|
||||
"name": "Instagram Telegram Bot Extractor",
|
||||
"type": ["extractor"],
|
||||
"dependencies": {"python": ["loguru", "telethon",],
|
||||
},
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"telethon",
|
||||
],
|
||||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 45,
|
||||
"type": "int",
|
||||
"help": "timeout to fetch the instagram content in seconds."},
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {
|
||||
"default": "secrets/anon-insta",
|
||||
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||
},
|
||||
"timeout": {"default": 45, "type": "int", "help": "timeout to fetch the instagram content in seconds."},
|
||||
},
|
||||
"description": """
|
||||
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
||||
|
||||
@@ -65,15 +65,15 @@ class InstagramTbotExtractor(Extractor):
|
||||
session_file_name = self.session_file + ".session"
|
||||
if os.path.exists(session_file_name):
|
||||
os.remove(session_file_name)
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
if not "instagram.com" in url: return False
|
||||
if not "instagram.com" in url:
|
||||
return False
|
||||
|
||||
result = Metadata()
|
||||
tmp_dir = self.tmp_dir
|
||||
with self.client.start():
|
||||
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
@@ -110,13 +110,14 @@ class InstagramTbotExtractor(Extractor):
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
# Skip known filler message:
|
||||
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
|
||||
if post.message == "The bot receives information through https://hikerapi.com/p/hJqpppqi":
|
||||
continue
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
filename_dest = os.path.join(tmp_dir, f"{chat.id}_{post.id}")
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
return message.strip()
|
||||
if post.message:
|
||||
message += post.message
|
||||
return message.strip()
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .local_storage import LocalStorage
|
||||
from .local_storage import LocalStorage
|
||||
|
||||
@@ -17,9 +17,11 @@
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
@@ -33,5 +35,5 @@
|
||||
### Notes
|
||||
- Default storage folder is `./local_archive`, but this can be changed via the `save_to` configuration.
|
||||
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import shutil
|
||||
from typing import IO
|
||||
import os
|
||||
@@ -9,7 +8,6 @@ from auto_archiver.core import Storage
|
||||
|
||||
|
||||
class LocalStorage(Storage):
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
# TODO: is this viable with Storage.configs on path/filename?
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
@@ -21,10 +19,11 @@ class LocalStorage(Storage):
|
||||
# override parent so that we can use shutil.copy2 and keep metadata
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
||||
res = shutil.copy2(media.filename, dest)
|
||||
logger.info(res)
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"description": """
|
||||
Adds metadata about archive operations, including file sizes and archive duration.
|
||||
|
||||
@@ -23,7 +23,9 @@ class MetaEnricher(Enricher):
|
||||
self.enrich_archive_duration(to_enrich)
|
||||
|
||||
def enrich_file_sizes(self, to_enrich: Metadata):
|
||||
logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
|
||||
logger.debug(
|
||||
f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)"
|
||||
)
|
||||
total_size = 0
|
||||
for media in to_enrich.get_all_media():
|
||||
file_stats = os.stat(media.filename)
|
||||
@@ -34,7 +36,6 @@ class MetaEnricher(Enricher):
|
||||
to_enrich.set("total_bytes", total_size)
|
||||
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
||||
|
||||
|
||||
def human_readable_bytes(self, size: int) -> str:
|
||||
# receives number of bytes and returns human readable size
|
||||
for unit in ["bytes", "KB", "MB", "GB", "TB"]:
|
||||
@@ -46,4 +47,4 @@ class MetaEnricher(Enricher):
|
||||
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
|
||||
|
||||
archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
|
||||
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
||||
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .metadata_enricher import MetadataEnricher
|
||||
from .metadata_enricher import MetadataEnricher
|
||||
|
||||
@@ -2,10 +2,7 @@
|
||||
"name": "Media Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": ["exiftool"]
|
||||
},
|
||||
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||
"description": """
|
||||
Extracts metadata information from files using ExifTool.
|
||||
|
||||
@@ -17,5 +14,5 @@
|
||||
### Notes
|
||||
- Requires ExifTool to be installed and accessible via the system's PATH.
|
||||
- Skips enrichment for files where metadata extraction fails.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@ class MetadataEnricher(Enricher):
|
||||
Extracts metadata information from files using exiftool.
|
||||
"""
|
||||
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"extracting EXIF metadata for {url=}")
|
||||
@@ -23,13 +22,13 @@ class MetadataEnricher(Enricher):
|
||||
def get_metadata(self, filename: str) -> dict:
|
||||
try:
|
||||
# Run ExifTool command to extract metadata from the file
|
||||
cmd = ['exiftool', filename]
|
||||
cmd = ["exiftool", filename]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
# Process the output to extract individual metadata fields
|
||||
metadata = {}
|
||||
for line in result.stdout.splitlines():
|
||||
field, value = line.strip().split(':', 1)
|
||||
field, value = line.strip().split(":", 1)
|
||||
metadata[field.strip()] = value.strip()
|
||||
return metadata
|
||||
except FileNotFoundError:
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
"name": "Mute Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
},
|
||||
"dependencies": {},
|
||||
"description": """ Default formatter.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -5,5 +5,5 @@ from auto_archiver.core import Formatter
|
||||
|
||||
|
||||
class MuteFormatter(Formatter):
|
||||
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
def format(self, item: Metadata) -> Media:
|
||||
return None
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .pdq_hash_enricher import PdqHashEnricher
|
||||
from .pdq_hash_enricher import PdqHashEnricher
|
||||
|
||||
@@ -17,5 +17,5 @@
|
||||
### Notes
|
||||
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
||||
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ This enricher is typically used after thumbnail or screenshot enrichers
|
||||
to ensure images are available for hashing.
|
||||
|
||||
"""
|
||||
|
||||
import traceback
|
||||
import pdqhash
|
||||
import numpy as np
|
||||
@@ -34,7 +35,12 @@ class PdqHashEnricher(Enricher):
|
||||
for m in to_enrich.media:
|
||||
for media in m.all_inner_media(True):
|
||||
media_id = media.get("id", "")
|
||||
if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)):
|
||||
if (
|
||||
media.is_image()
|
||||
and "screenshot" not in media_id
|
||||
and "warc-file-" not in media_id
|
||||
and len(hd := self.calculate_pdq_hash(media.filename))
|
||||
):
|
||||
media.set("pdq_hash", hd)
|
||||
media_with_hashes.append(media.filename)
|
||||
|
||||
@@ -51,5 +57,7 @@ class PdqHashEnricher(Enricher):
|
||||
hash = "".join(str(b) for b in hash_array)
|
||||
return hex(int(hash, 2))[2:]
|
||||
except UnidentifiedImageError as e:
|
||||
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
|
||||
logger.error(
|
||||
f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}"
|
||||
)
|
||||
return ""
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .s3_storage import S3Storage
|
||||
from .s3_storage import S3Storage
|
||||
|
||||
@@ -20,20 +20,20 @@
|
||||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
"secret": {"default": None, "help": "S3 API secret"},
|
||||
"random_no_duplicate": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
|
||||
"random_no_duplicate": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`",
|
||||
},
|
||||
"endpoint_url": {
|
||||
"default": 'https://{region}.digitaloceanspaces.com',
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
||||
"default": "https://{region}.digitaloceanspaces.com",
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime",
|
||||
},
|
||||
"cdn_url": {
|
||||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
||||
"default": "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}",
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime",
|
||||
},
|
||||
"private": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "if true S3 files will not be readable online"},
|
||||
"private": {"default": False, "type": "bool", "help": "if true S3 files will not be readable online"},
|
||||
},
|
||||
"description": """
|
||||
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
||||
@@ -50,5 +50,5 @@
|
||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||
- Uses `boto3` for interaction with the S3 API.
|
||||
- Depends on the `HashEnricher` module for hash calculation.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
from typing import IO
|
||||
|
||||
import boto3
|
||||
@@ -11,60 +10,62 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
class S3Storage(Storage):
|
||||
|
||||
class S3Storage(Storage):
|
||||
def setup(self) -> None:
|
||||
self.s3 = boto3.client(
|
||||
's3',
|
||||
"s3",
|
||||
region_name=self.region,
|
||||
endpoint_url=self.endpoint_url.format(region=self.region),
|
||||
aws_access_key_id=self.key,
|
||||
aws_secret_access_key=self.secret
|
||||
aws_secret_access_key=self.secret,
|
||||
)
|
||||
if self.random_no_duplicate:
|
||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
||||
logger.warning(
|
||||
"random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
|
||||
)
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||
|
||||
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
||||
if not self.is_upload_needed(media): return True
|
||||
if not self.is_upload_needed(media):
|
||||
return True
|
||||
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
if not self.private and 'ACL' not in extra_args:
|
||||
extra_args['ACL'] = 'public-read'
|
||||
if not self.private and "ACL" not in extra_args:
|
||||
extra_args["ACL"] = "public-read"
|
||||
|
||||
if 'ContentType' not in extra_args:
|
||||
if "ContentType" not in extra_args:
|
||||
try:
|
||||
if media.mimetype:
|
||||
extra_args['ContentType'] = media.mimetype
|
||||
extra_args["ContentType"] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
hd = calculate_file_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
if existing_key := self.file_in_folder(path):
|
||||
media.key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
def file_in_folder(self, path: str) -> str:
|
||||
# checks if path exists and is not an empty folder
|
||||
if not path.endswith('/'):
|
||||
path = path + '/'
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
if not path.endswith("/"):
|
||||
path = path + "/"
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
|
||||
if "Contents" in resp:
|
||||
return resp["Contents"][0]["Key"]
|
||||
return False
|
||||
|
||||
|
||||
@@ -6,26 +6,29 @@
|
||||
"python": ["loguru", "selenium"],
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280,
|
||||
"type": "int",
|
||||
"help": "width of the screenshots"},
|
||||
"height": {"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"},
|
||||
"timeout": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader"},
|
||||
"width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
|
||||
"height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {
|
||||
"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot",
|
||||
},
|
||||
"http_proxy": {
|
||||
"default": "",
|
||||
"help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
"save_to_pdf": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
|
||||
},
|
||||
"print_options": {
|
||||
"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
@@ -37,5 +40,5 @@
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -9,8 +9,8 @@ from auto_archiver.core import Enricher
|
||||
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
def __init__(self, webdriver_factory=None):
|
||||
super().__init__()
|
||||
self.webdriver_factory = webdriver_factory or Webdriver
|
||||
@@ -25,8 +25,14 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
with self.webdriver_factory(
|
||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
self.width,
|
||||
self.height,
|
||||
self.timeout,
|
||||
facebook_accept_cookies="facebook.com" in url,
|
||||
http_proxy=self.http_proxy,
|
||||
print_options=self.print_options,
|
||||
auth=auth,
|
||||
) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
@@ -43,4 +49,3 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .ssl_enricher import SSLEnricher
|
||||
from .ssl_enricher import SSLEnricher
|
||||
|
||||
@@ -5,11 +5,13 @@
|
||||
"dependencies": {
|
||||
"python": ["loguru", "slugify"],
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"entry_point": "ssl_enricher::SSLEnricher",
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "if true, will skip enriching when no media is archived"},
|
||||
"skip_when_nothing_archived": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if true, will skip enriching when no media is archived",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
@@ -21,5 +23,5 @@
|
||||
|
||||
### Notes
|
||||
- Requires the target URL to use the HTTPS scheme; other schemes are not supported.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -13,16 +13,18 @@ class SSLEnricher(Enricher):
|
||||
"""
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not to_enrich.media and self.skip_when_nothing_archived: return
|
||||
|
||||
if not to_enrich.media and self.skip_when_nothing_archived:
|
||||
return
|
||||
|
||||
url = to_enrich.get_url()
|
||||
parsed = urlparse(url)
|
||||
assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
|
||||
|
||||
|
||||
domain = parsed.netloc
|
||||
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
|
||||
|
||||
cert = ssl.get_server_certificate((domain, 443))
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
||||
with open(cert_fn, "w") as f: f.write(cert)
|
||||
with open(cert_fn, "w") as f:
|
||||
f.write(cert)
|
||||
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .telegram_extractor import TelegramExtractor
|
||||
from .telegram_extractor import TelegramExtractor
|
||||
|
||||
@@ -15,11 +15,11 @@ class TelegramExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 't.me' != item.netloc:
|
||||
if "t.me" != item.netloc:
|
||||
return False
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
|
||||
# TODO: check if we can do this more resilient to variable URLs
|
||||
@@ -27,11 +27,11 @@ class TelegramExtractor(Extractor):
|
||||
url += "?embed=1"
|
||||
|
||||
t = requests.get(url, headers=headers)
|
||||
s = BeautifulSoup(t.content, 'html.parser')
|
||||
s = BeautifulSoup(t.content, "html.parser")
|
||||
|
||||
result = Metadata()
|
||||
result.set_content(html.escape(str(t.content)))
|
||||
if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')):
|
||||
if timestamp := (s.find_all("time") or [{}])[0].get("datetime"):
|
||||
result.set_timestamp(timestamp)
|
||||
|
||||
video = s.find("video")
|
||||
@@ -41,25 +41,26 @@ class TelegramExtractor(Extractor):
|
||||
|
||||
image_urls = []
|
||||
for im in image_tags:
|
||||
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
|
||||
urls = [u.replace("'", "") for u in re.findall(r"url\((.*?)\)", im["style"])]
|
||||
image_urls += urls
|
||||
|
||||
if not len(image_urls): return False
|
||||
if not len(image_urls):
|
||||
return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
else:
|
||||
video_url = video.get('src')
|
||||
video_url = video.get("src")
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(
|
||||
':')[0]) * 60 + float(duration.split(':')[1])
|
||||
duration = s.find_all("time")[0].contents[0]
|
||||
if ":" in duration:
|
||||
duration = float(duration.split(":")[0]) * 60 + float(duration.split(":")[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
m_video.set("duration", duration)
|
||||
except: pass
|
||||
except:
|
||||
pass
|
||||
result.add_media(m_video)
|
||||
|
||||
return result.success("telegram")
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .telethon_extractor import TelethonExtractor
|
||||
from .telethon_extractor import TelethonExtractor
|
||||
|
||||
@@ -3,26 +3,35 @@
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["telethon",
|
||||
"loguru",
|
||||
"tqdm",
|
||||
],
|
||||
"bin": [""]
|
||||
"python": [
|
||||
"telethon",
|
||||
"loguru",
|
||||
"tqdm",
|
||||
],
|
||||
"bin": [""],
|
||||
},
|
||||
"configs": {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"type": "json_loader",
|
||||
}
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {
|
||||
"default": None,
|
||||
"help": "optional, but allows access to more content such as large videos, talk to @botfather",
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/anon",
|
||||
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||
},
|
||||
"join_channels": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck",
|
||||
},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"type": "json_loader",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
|
||||
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
|
||||
@@ -46,5 +55,5 @@ To use the `TelethonExtractor`, you must configure the following:
|
||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
|
||||
import shutil
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors import ChannelInvalidError
|
||||
from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
|
||||
from telethon.errors.rpcerrorlist import (
|
||||
UserAlreadyParticipantError,
|
||||
FloodWaitError,
|
||||
InviteRequestSentError,
|
||||
InviteHashExpiredError,
|
||||
)
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import re, time, os
|
||||
@@ -17,9 +21,7 @@ class TelethonExtractor(Extractor):
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
|
||||
"""
|
||||
1. makes a copy of session_file that is removed in cleanup
|
||||
2. trigger login process for telegram or proceed if already saved in a session file
|
||||
@@ -34,7 +36,7 @@ class TelethonExtractor(Extractor):
|
||||
|
||||
# initiate the client
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
|
||||
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
||||
|
||||
@@ -52,13 +54,15 @@ class TelethonExtractor(Extractor):
|
||||
channel_invite = self.channel_invites[i]
|
||||
channel_id = channel_invite.get("id", False)
|
||||
invite = channel_invite["invite"]
|
||||
if (match := self.invite_pattern.search(invite)):
|
||||
if match := self.invite_pattern.search(invite):
|
||||
try:
|
||||
if channel_id:
|
||||
ent = self.client.get_entity(int(channel_id)) # fails if not a member
|
||||
else:
|
||||
ent = self.client.get_entity(invite) # fails if not a member
|
||||
logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.")
|
||||
logger.warning(
|
||||
f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
|
||||
)
|
||||
except ValueError as e:
|
||||
logger.info(f"joining new channel {invite=}")
|
||||
try:
|
||||
@@ -95,7 +99,8 @@ class TelethonExtractor(Extractor):
|
||||
# detect URLs that we definitely cannot handle
|
||||
match = self.valid_url.search(url)
|
||||
logger.debug(f"TELETHON: {match=}")
|
||||
if not match: return False
|
||||
if not match:
|
||||
return False
|
||||
|
||||
is_private = match.group(1) == "/c"
|
||||
chat = int(match.group(2)) if is_private else match.group(2)
|
||||
@@ -105,45 +110,53 @@ class TelethonExtractor(Extractor):
|
||||
|
||||
# NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token)
|
||||
with self.client.start():
|
||||
# with self.client.start(bot_token=self.bot_token):
|
||||
# with self.client.start(bot_token=self.bot_token):
|
||||
try:
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
except ValueError as e:
|
||||
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
||||
return False
|
||||
except ChannelInvalidError as e:
|
||||
logger.error(f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}")
|
||||
logger.error(
|
||||
f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}"
|
||||
)
|
||||
return False
|
||||
|
||||
logger.debug(f"TELETHON GOT POST {post=}")
|
||||
if post is None: return False
|
||||
if post is None:
|
||||
return False
|
||||
|
||||
media_posts = self._get_media_posts_in_group(chat, post)
|
||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||
logger.debug(f"got {len(media_posts)=} for {url=}")
|
||||
|
||||
tmp_dir = self.tmp_dir
|
||||
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
title = post.message
|
||||
for mp in media_posts:
|
||||
if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1)
|
||||
if len(mp.message) > len(title):
|
||||
title = mp.message # save the longest text found (usually only 1)
|
||||
|
||||
# media can also be in entities
|
||||
if mp.entities:
|
||||
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
|
||||
other_media_urls = [
|
||||
e.url
|
||||
for e in mp.entities
|
||||
if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]
|
||||
]
|
||||
if len(other_media_urls):
|
||||
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
|
||||
for i, om_url in enumerate(other_media_urls):
|
||||
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
|
||||
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
||||
filename_dest = os.path.join(tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
||||
filename = self.client.download_media(mp.media, filename_dest)
|
||||
if not filename:
|
||||
logger.debug(f"Empty media found, skipping {str(mp)=}")
|
||||
continue
|
||||
result.add_media(Media(filename))
|
||||
|
||||
|
||||
result.set_title(title).set_timestamp(post.date).set("api_data", post.to_dict())
|
||||
if post.message != title:
|
||||
result.set_content(post.message)
|
||||
|
||||
@@ -2,18 +2,19 @@
|
||||
"name": "Thumbnail Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "ffmpeg"],
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"dependencies": {"python": ["loguru", "ffmpeg"], "bin": ["ffmpeg"]},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
"thumbnails_per_minute": {
|
||||
"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails",
|
||||
},
|
||||
"max_thumbnails": {
|
||||
"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
@@ -27,5 +28,5 @@
|
||||
- Requires `ffmpeg` to be installed and accessible via the system's PATH.
|
||||
- Handles videos without pre-existing duration metadata by probing with `ffmpeg`.
|
||||
- Skips enrichment for non-video media files.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ visual snapshots of the video's keyframes, helping users preview content
|
||||
and identify important moments without watching the entire video.
|
||||
|
||||
"""
|
||||
|
||||
import ffmpeg, os
|
||||
from loguru import logger
|
||||
|
||||
@@ -18,7 +19,7 @@ class ThumbnailEnricher(Enricher):
|
||||
"""
|
||||
Generates thumbnails for all the media
|
||||
"""
|
||||
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
"""
|
||||
Uses or reads the video duration to generate thumbnails
|
||||
@@ -36,7 +37,9 @@ class ThumbnailEnricher(Enricher):
|
||||
if duration is None:
|
||||
try:
|
||||
probe = ffmpeg.probe(m.filename)
|
||||
duration = float(next(stream for stream in probe['streams'] if stream['codec_type'] == 'video')['duration'])
|
||||
duration = float(
|
||||
next(stream for stream in probe["streams"] if stream["codec_type"] == "video")["duration"]
|
||||
)
|
||||
to_enrich.media[m_id].set("duration", duration)
|
||||
except Exception as e:
|
||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||
@@ -48,11 +51,13 @@ class ThumbnailEnricher(Enricher):
|
||||
thumbnails_media = []
|
||||
for index, timestamp in enumerate(timestamps):
|
||||
output_path = os.path.join(folder, f"out{index}.jpg")
|
||||
ffmpeg.input(m.filename, ss=timestamp).filter('scale', 512, -1).output(output_path, vframes=1, loglevel="quiet").run()
|
||||
ffmpeg.input(m.filename, ss=timestamp).filter("scale", 512, -1).output(
|
||||
output_path, vframes=1, loglevel="quiet"
|
||||
).run()
|
||||
|
||||
try:
|
||||
thumbnails_media.append(Media(
|
||||
filename=output_path)
|
||||
thumbnails_media.append(
|
||||
Media(filename=output_path)
|
||||
.set("id", f"thumbnail_{index}")
|
||||
.set("timestamp", "%.3fs" % timestamp)
|
||||
)
|
||||
|
||||
@@ -3,38 +3,29 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"slugify",
|
||||
"tsp_client",
|
||||
"asn1crypto",
|
||||
"certvalidator",
|
||||
"certifi"
|
||||
],
|
||||
"python": ["loguru", "slugify", "tsp_client", "asn1crypto", "certvalidator", "certifi"],
|
||||
},
|
||||
"configs": {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
|
||||
# [Adobe: European Union Trusted Lists].
|
||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
|
||||
# [Windows Cert Store]
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
# [Adobe: European Union Trusted Lists].
|
||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
# [Windows Cert Store]
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
}
|
||||
},
|
||||
@@ -50,5 +41,5 @@
|
||||
### Notes
|
||||
- Should be run after the `hash_enricher` to ensure file hashes are available.
|
||||
- Requires internet access to interact with the configured TSAs.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ import certifi
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
"""
|
||||
Uses several RFC3161 Time Stamp Authorities to generate a timestamp token that will be preserved. This can be used to prove that a certain file existed at a certain time, useful for legal purposes, for example, to prove that a certain file was not tampered with after a certain date.
|
||||
@@ -25,27 +26,30 @@ class TimestampingEnricher(Enricher):
|
||||
logger.debug(f"RFC3161 timestamping existing files for {url=}")
|
||||
|
||||
# create a new text file with the existing media hashes
|
||||
hashes = [m.get("hash").replace("SHA-256:", "").replace("SHA3-512:", "") for m in to_enrich.media if m.get("hash")]
|
||||
hashes = [
|
||||
m.get("hash").replace("SHA-256:", "").replace("SHA3-512:", "") for m in to_enrich.media if m.get("hash")
|
||||
]
|
||||
|
||||
if not len(hashes):
|
||||
logger.warning(f"No hashes found in {url=}")
|
||||
return
|
||||
|
||||
|
||||
tmp_dir = self.tmp_dir
|
||||
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
|
||||
|
||||
data_to_sign = "\n".join(hashes)
|
||||
with open(hashes_fn, "w") as f:
|
||||
with open(hashes_fn, "w") as f:
|
||||
f.write(data_to_sign)
|
||||
hashes_media = Media(filename=hashes_fn)
|
||||
|
||||
timestamp_tokens = []
|
||||
from slugify import slugify
|
||||
|
||||
for tsa_url in self.tsa_urls:
|
||||
try:
|
||||
signing_settings = SigningSettings(tsp_server=tsa_url, digest_algorithm=DigestAlgorithm.SHA256)
|
||||
signer = TSPSigner()
|
||||
message = bytes(data_to_sign, encoding='utf8')
|
||||
message = bytes(data_to_sign, encoding="utf8")
|
||||
# send TSQ and get TSR from the TSA server
|
||||
signed = signer.sign(message=message, signing_settings=signing_settings)
|
||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs
|
||||
@@ -54,7 +58,8 @@ class TimestampingEnricher(Enricher):
|
||||
cert_chain = self.download_and_verify_certificate(signed)
|
||||
# continue with saving the timestamp token
|
||||
tst_fn = os.path.join(tmp_dir, f"timestamp_token_{slugify(tsa_url)}")
|
||||
with open(tst_fn, "wb") as f: f.write(signed)
|
||||
with open(tst_fn, "wb") as f:
|
||||
f.write(signed)
|
||||
timestamp_tokens.append(Media(filename=tst_fn).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
||||
except Exception as e:
|
||||
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")
|
||||
@@ -75,7 +80,7 @@ class TimestampingEnricher(Enricher):
|
||||
tst = ContentInfo.load(signed)
|
||||
|
||||
trust_roots = []
|
||||
with open(certifi.where(), 'rb') as f:
|
||||
with open(certifi.where(), "rb") as f:
|
||||
for _, _, der_bytes in pem.unarmor(f.read(), multiple=True):
|
||||
trust_roots.append(der_bytes)
|
||||
context = ValidationContext(trust_roots=trust_roots)
|
||||
@@ -83,11 +88,11 @@ class TimestampingEnricher(Enricher):
|
||||
certificates = tst["content"]["certificates"]
|
||||
first_cert = certificates[0].dump()
|
||||
intermediate_certs = []
|
||||
for i in range(1, len(certificates)): # cannot use list comprehension [1:]
|
||||
for i in range(1, len(certificates)): # cannot use list comprehension [1:]
|
||||
intermediate_certs.append(certificates[i].dump())
|
||||
|
||||
validator = CertificateValidator(first_cert, intermediate_certs=intermediate_certs, validation_context=context)
|
||||
path = validator.validate_usage({'digital_signature'}, extended_key_usage={'time_stamping'})
|
||||
path = validator.validate_usage({"digital_signature"}, extended_key_usage={"time_stamping"})
|
||||
|
||||
cert_chain = []
|
||||
for cert in path:
|
||||
@@ -96,4 +101,4 @@ class TimestampingEnricher(Enricher):
|
||||
f.write(cert.dump())
|
||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
|
||||
|
||||
return cert_chain
|
||||
return cert_chain
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .twitter_api_extractor import TwitterApiExtractor
|
||||
from .twitter_api_extractor import TwitterApiExtractor
|
||||
|
||||
@@ -3,21 +3,28 @@
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["requests",
|
||||
"loguru",
|
||||
"pytwitter",
|
||||
"slugify",],
|
||||
"bin": [""]
|
||||
"python": [
|
||||
"requests",
|
||||
"loguru",
|
||||
"pytwitter",
|
||||
"slugify",
|
||||
],
|
||||
"bin": [""],
|
||||
},
|
||||
"configs": {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||
},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||
"bearer_token": {
|
||||
"default": None,
|
||||
"help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret",
|
||||
},
|
||||
"bearer_tokens": {
|
||||
"default": [],
|
||||
"help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||
},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||
},
|
||||
"description": """
|
||||
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
|
||||
It supports multiple API configurations for extended rate limits and reliable access.
|
||||
@@ -39,6 +46,5 @@
|
||||
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
|
||||
|
||||
Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
|
||||
"""
|
||||
,
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@ from slugify import slugify
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class TwitterApiExtractor(Extractor):
|
||||
|
||||
class TwitterApiExtractor(Extractor):
|
||||
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def setup(self) -> None:
|
||||
@@ -23,30 +23,38 @@ class TwitterApiExtractor(Extractor):
|
||||
if self.bearer_token:
|
||||
self.apis.append(Api(bearer_token=self.bearer_token))
|
||||
if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
|
||||
self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
|
||||
access_token=self.access_token, access_secret=self.access_secret))
|
||||
assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
||||
self.apis.append(
|
||||
Api(
|
||||
consumer_key=self.consumer_key,
|
||||
consumer_secret=self.consumer_secret,
|
||||
access_token=self.access_token,
|
||||
access_secret=self.access_secret,
|
||||
)
|
||||
)
|
||||
assert self.api_client is not None, (
|
||||
"Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
||||
)
|
||||
|
||||
@property # getter .mimetype
|
||||
def api_client(self) -> str:
|
||||
return self.apis[self.api_index]
|
||||
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
# expand URL if t.co and clean tracker GET params
|
||||
if 'https://t.co/' in url:
|
||||
if "https://t.co/" in url:
|
||||
try:
|
||||
r = requests.get(url, timeout=30)
|
||||
logger.debug(f'Expanded url {url} to {r.url}')
|
||||
logger.debug(f"Expanded url {url} to {r.url}")
|
||||
url = r.url
|
||||
except:
|
||||
logger.error(f'Failed to expand url {url}')
|
||||
logger.error(f"Failed to expand url {url}")
|
||||
return url
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
# call download retry until success or no more apis
|
||||
while self.api_index < len(self.apis):
|
||||
if res := self.download_retry(item): return res
|
||||
if res := self.download_retry(item):
|
||||
return res
|
||||
self.api_index += 1
|
||||
self.api_index = 0
|
||||
return False
|
||||
@@ -54,7 +62,8 @@ class TwitterApiExtractor(Extractor):
|
||||
def get_username_tweet_id(self, url):
|
||||
# detect URLs that we definitely cannot handle
|
||||
matches = self.valid_url.findall(url)
|
||||
if not len(matches): return False, False
|
||||
if not len(matches):
|
||||
return False, False
|
||||
|
||||
username, tweet_id = matches[0] # only one URL supported
|
||||
logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
|
||||
@@ -65,10 +74,16 @@ class TwitterApiExtractor(Extractor):
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
username, tweet_id = self.get_username_tweet_id(url)
|
||||
if not username: return False
|
||||
if not username:
|
||||
return False
|
||||
|
||||
try:
|
||||
tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
|
||||
tweet = self.api_client.get_tweet(
|
||||
tweet_id,
|
||||
expansions=["attachments.media_keys"],
|
||||
media_fields=["type", "duration_ms", "url", "variants"],
|
||||
tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
|
||||
)
|
||||
logger.debug(tweet)
|
||||
except Exception as e:
|
||||
logger.error(f"Could not get tweet: {e}")
|
||||
@@ -88,29 +103,35 @@ class TwitterApiExtractor(Extractor):
|
||||
mimetype = "image/jpeg"
|
||||
elif hasattr(m, "variants"):
|
||||
variant = self.choose_variant(m.variants)
|
||||
if not variant: continue
|
||||
if not variant:
|
||||
continue
|
||||
media.set("src", variant.url)
|
||||
mimetype = variant.content_type
|
||||
else:
|
||||
continue
|
||||
logger.info(f"Found media {media}")
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
||||
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
result.add_media(media)
|
||||
|
||||
result.set_content(json.dumps({
|
||||
"id": tweet.data.id,
|
||||
"text": tweet.data.text,
|
||||
"created_at": tweet.data.created_at,
|
||||
"author_id": tweet.data.author_id,
|
||||
"geo": tweet.data.geo,
|
||||
"lang": tweet.data.lang,
|
||||
"media": urls
|
||||
}, ensure_ascii=False, indent=4))
|
||||
result.set_content(
|
||||
json.dumps(
|
||||
{
|
||||
"id": tweet.data.id,
|
||||
"text": tweet.data.text,
|
||||
"created_at": tweet.data.created_at,
|
||||
"author_id": tweet.data.author_id,
|
||||
"geo": tweet.data.geo,
|
||||
"lang": tweet.data.lang,
|
||||
"media": urls,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
return result.success("twitter-api")
|
||||
|
||||
def choose_variant(self, variants):
|
||||
|
||||
"""
|
||||
Chooses the highest quality variable possible out of a list of variants
|
||||
"""
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user