mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Set up feeder manifests (not merged by source yet)
This commit is contained in:
@@ -4,4 +4,9 @@
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
# from .config import Config
|
||||
# from .config import Config
|
||||
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
from .metadata import Metadata
|
||||
|
||||
@@ -190,7 +190,6 @@ class ArchivingOrchestrator:
|
||||
|
||||
yaml_config = read_yaml(basic_config.config_file)
|
||||
|
||||
breakpoint()
|
||||
self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
self.install_modules()
|
||||
|
||||
@@ -3,8 +3,3 @@
|
||||
|
||||
"""
|
||||
from .database import Database
|
||||
from .gsheet_db.gsheet_db import GsheetsDb
|
||||
from .console_db.console_db import ConsoleDb
|
||||
from .csv_db.csv_db import CSVDb
|
||||
from .api_db.api_db import AAApiDb
|
||||
from .atlos_db.atlos_db import AtlosDb
|
||||
@@ -1,70 +0,0 @@
|
||||
from typing import Union
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
    """
    Connects to auto-archiver-api instance

    `fetch` searches the API for earlier archives of a URL and `done` submits
    a finished archive to it. Both authenticate with a Bearer token.
    """
    name = "auto_archiver_api_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        # Normalise the two flags to real booleans before they are tested.
        self.allow_rearchive = bool(self.allow_rearchive)
        self.store_results = bool(self.store_results)
        self.assert_valid_string("api_endpoint")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help) for this database."""
        return {
            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
            "api_token": {"default": None, "help": "API Bearer token."},
            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
            "author_id": {"default": None, "help": "which email to assign as author"},
            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
        }

    def _api_url(self, path: str) -> str:
        """Return `path` appended to the configured endpoint with a single '/'."""
        # URLs always use forward slashes; os.path.join would insert the OS
        # path separator instead (a backslash on Windows).
        return f"{self.api_endpoint.rstrip('/')}/{path}"

    @staticmethod
    def _response_body(response):
        """Return the decoded JSON body, or the raw text when it is not JSON."""
        try:
            return response.json()
        except ValueError:
            # Error pages from proxies/gateways are frequently not JSON; the
            # log line must not raise because of that.
            return response.text

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """ query the database for the existence of this item.
            Helps avoid re-archiving the same URL multiple times.

            Returns the most complete previously stored Metadata, or False
            when the lookup is skipped, fails, or finds nothing.
        """
        # NOTE(review): the `allow_rearchive` help text says the API is queried
        # when the flag is False, but this guard skips the query in that case.
        # Behaviour is kept as-is; confirm which of the two is intended.
        if not self.allow_rearchive:
            return False

        params = {"url": item.get_url(), "limit": 15}
        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
        response = requests.get(self._api_url("tasks/search-url"), params=params, headers=headers)

        if response.status_code != 200:
            logger.error(f"AA API FAIL ({response.status_code}): {self._response_body(response)}")
            return False

        # Decode the body once instead of re-parsing it for every use.
        results = response.json()
        if results:
            logger.success(f"API returned {len(results)} previously archived instance(s)")
            fetched_metadata = [Metadata.from_dict(r["result"]) for r in results]
            return Metadata.choose_most_complete(fetched_metadata)
        return False

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB

        Does nothing when `store_results` is off or when the result came from
        a cache.
        """
        if not self.store_results:
            return
        if cached:
            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
            return
        logger.debug(f"saving archive of {item.get_url()} to the AA API.")

        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
        headers = {"Authorization": f"Bearer {self.api_token}"}
        response = requests.post(self._api_url("submit-archive"), json=payload, headers=headers)

        if response.status_code == 200:
            logger.success(f"AA API: {self._response_body(response)}")
        else:
            logger.error(f"AA API FAIL ({response.status_code}): {self._response_body(response)}")
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
import os
|
||||
from typing import Union
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
import requests
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
from ...utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosDb(Database):
    """
    Outputs results to Atlos

    Success and failure outcomes are POSTed to the Atlos metadata endpoint of
    the source material identified by the item's "atlos_id" metadata entry.
    """

    name = "atlos_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the shared Atlos configuration options."""
        return get_atlos_config_options()

    def failed(self, item: Metadata, reason: str) -> None:
        """Update DB accordingly for failure"""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            # If the item has no Atlos ID, there's nothing for us to do
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        body = {"metadata": {"processed": True, "status": "error", "error": reason}}

        response = requests.post(endpoint, headers=auth_headers, json=body)
        # Any HTTP error status is surfaced to the caller as an exception.
        response.raise_for_status()
        logger.info(
            f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}"
        )

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check and fetch if the given item has been archived already, each
        database should handle its own caching, and configuration mechanisms"""
        # This database never reports a previously archived item.
        return False

    def _process_metadata(self, item: Metadata) -> dict:
        """Process metadata for storage on Atlos. Will convert any datetime
        objects to ISO format."""
        processed = {}
        for key, value in item.metadata.items():
            # Anything exposing isoformat() is serialised through it; every
            # other value is passed through untouched.
            if hasattr(value, "isoformat"):
                processed[key] = value.isoformat()
            else:
                processed[key] = value
        return processed

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB"""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        body = {
            "metadata": {
                "processed": True,
                "status": "success",
                "results": self._process_metadata(item),
            }
        }

        response = requests.post(endpoint, headers=auth_headers, json=body)
        # Any HTTP error status is surfaced to the caller as an exception.
        response.raise_for_status()

        logger.info(
            f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos"
        )
|
||||
@@ -1,32 +0,0 @@
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class ConsoleDb(Database):
    """
    Outputs results to the console

    Every lifecycle hook only emits one log line describing the item.
    """
    name = "console_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This database has no configuration options."""
        return dict()

    def started(self, item: Metadata) -> None:
        """Log that archiving of `item` has started."""
        message = f"STARTED {item}"
        logger.warning(message)

    def failed(self, item: Metadata, reason: str) -> None:
        """Log that archiving of `item` failed, together with the reason."""
        message = f"FAILED {item}: {reason}"
        logger.error(message)

    def aborted(self, item: Metadata) -> None:
        """Log that archiving of `item` was aborted."""
        message = f"ABORTED {item}"
        logger.warning(message)

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB"""
        message = f"DONE {item}"
        logger.success(message)
|
||||
@@ -1,34 +0,0 @@
|
||||
import os
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class CSVDb(Database):
    """
    Outputs results to a CSV file

    One row is appended per finished item; the header row is written only
    when the target file is missing or empty.
    """
    name = "csv_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        self.assert_valid_string("csv_file")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help) for this database."""
        return {
            "csv_file": {"default": "db.csv", "help": "CSV file name"}
        }

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item}")
        # Decide about the header before opening: opening in append mode
        # creates the file if it does not exist yet.
        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
        # newline="" is required by the csv module; without it, platforms that
        # translate "\n" on write (Windows) end up with blank rows between records.
        with open(self.csv_file, "a", encoding="utf-8", newline="") as outf:
            # Column names are the field names of a default Metadata instance.
            writer = DictWriter(outf, fieldnames=list(asdict(Metadata())))
            if is_empty:
                writer.writeheader()
            writer.writerow(asdict(item))
|
||||
@@ -1,112 +0,0 @@
|
||||
from typing import Union, Tuple
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
from ...utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
    """
    NB: only works if GsheetFeeder is used.
    could be updated in the future to support non-GsheetFeeder metadata

    Writes archiving progress and results back into the worksheet row that is
    published in ArchivingContext under the "gsheet" key.
    """
    name = "gsheet_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This database has no configuration options of its own."""
        return {}

    def started(self, item: Metadata) -> None:
        """Mark the item's row as in progress (errors propagate to the caller)."""
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, 'status', 'Archive in progress')

    def failed(self, item: Metadata, reason: str) -> None:
        """Record the failure reason in the status cell (best effort)."""
        logger.error(f"FAILED {item}")
        self._safe_status_update(item, f'Archive failed {reason}')

    def aborted(self, item: Metadata) -> None:
        """Clear the status cell (best effort)."""
        logger.warning(f"ABORTED {item}")
        self._safe_status_update(item, '')

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check if the given item has been archived already"""
        return False

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB

        Collects all cell changes for the item's row and sends them in one
        batch. Apart from 'status', a column is only written when it exists
        in the sheet and its cell is currently empty.
        """
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)

        cell_updates = []
        row_values = gw.get_row(row)

        def batch_if_valid(col, val, final_value=None):
            # `val` gates the update; `final_value` (defaulting to `val`) is
            # what actually gets written.
            final_value = final_value or val
            try:
                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")

        status_message = item.status
        if cached:
            status_message = f"[cached] {status_message}"
        # The status cell is always overwritten, whatever it currently holds.
        cell_updates.append((row, 'status', status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
            batch_if_valid('archive', "\n".join(media.urls))
        # now(timezone.utc) is already timezone-aware, so no tzinfo fix-up is needed.
        batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).isoformat())
        batch_if_valid('title', item.get_title())
        batch_if_valid('text', item.get("content", ""))
        batch_if_valid('timestamp', item.get_timestamp())
        if media:
            batch_if_valid('hash', media.get("hash", "not-calculated"))

        # merge all pdq hashes into a single string, if present
        pdq_hashes = [pdq for m in item.get_all_media() if (pdq := m.get("pdq_hash"))]
        if pdq_hashes:
            batch_if_valid('pdq_hash', ",".join(pdq_hashes))

        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid('screenshot', "\n".join(screenshot.urls))

        # Require a non-empty urls list so indexing [0] cannot raise and
        # discard every update collected so far.
        if (thumbnail := item.get_first_image("thumbnail")) and getattr(thumbnail, "urls", None):
            batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')

        # Same hasattr guard as for the other media kinds.
        if (browsertrix := item.get_media_by_id("browsertrix")) and hasattr(browsertrix, "urls"):
            batch_if_valid('wacz', "\n".join(browsertrix.urls))
            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))

        gw.batch_set_cell(cell_updates)

    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
        """Set the status cell, logging (not raising) on any failure."""
        try:
            gw, row = self._retrieve_gsheet(item)
            gw.set_cell(row, 'status', new_status)
        except Exception as e:
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        """Return the (worksheet, row) pair stored in ArchivingContext.

        Raises:
            ValueError: if no "gsheet" entry is present in the context.
        """
        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
        gsheet = ArchivingContext.get("gsheet")
        if not gsheet:
            # Previously this path fell through to `return gw, row` with both
            # names unassigned; fail with an explicit message instead.
            raise ValueError("No 'gsheet' entry found in ArchivingContext; gsheet_db only works together with the gsheet feeder")
        gw: GWorksheet = gsheet.get("worksheet")
        row: int = gsheet.get("row")
        return gw, row
|
||||
@@ -2,8 +2,8 @@ from typing import Union
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
|
||||
@@ -19,18 +19,7 @@ class AAApiDb(Database):
|
||||
self.store_results = bool(self.store_results)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_token": {"default": None, "help": "API Bearer token."},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
}
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
@@ -7,7 +7,18 @@
|
||||
{"python": ["loguru",
|
||||
""],
|
||||
"bin": [""]},
|
||||
"configs": {},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Handles integration with the Atlos platform for managing archival results.
|
||||
|
||||
@@ -5,9 +5,9 @@ from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
import requests
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..utils import get_atlos_config_options
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosDb(Database):
|
||||
@@ -21,6 +21,7 @@ class AtlosDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
# TODO
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return get_atlos_config_options()
|
||||
34
src/auto_archiver/modules/atlos_feeder/__manifest__.py
Normal file
34
src/auto_archiver/modules/atlos_feeder/__manifest__.py
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "Atlos Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
|
||||
|
||||
### Features
|
||||
- Connects to the Atlos API to retrieve a list of source material URLs.
|
||||
- Filters source materials based on visibility, processing status, and metadata.
|
||||
- Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
|
||||
- Iterates through paginated results using a cursor for efficient API interaction.
|
||||
|
||||
### Notes
|
||||
- Requires an Atlos API endpoint and a valid API token for authentication.
|
||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||
- Handles pagination transparently when retrieving data from the Atlos API.
|
||||
"""
|
||||
}
|
||||
@@ -1,9 +1,9 @@
|
||||
from loguru import logger
|
||||
import requests
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import get_atlos_config_options
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosFeeder(Feeder):
|
||||
@@ -15,6 +15,7 @@ class AtlosFeeder(Feeder):
|
||||
if type(self.api_token) != str:
|
||||
raise Exception("Atlos Feeder did not receive an Atlos API token")
|
||||
|
||||
# TODO
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return get_atlos_config_options()
|
||||
24
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
24
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "CLI Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
|
||||
|
||||
### Features
|
||||
- Takes a single URL or a list of URLs provided via the command line.
|
||||
- Converts each URL into a `Metadata` object and yields it for processing.
|
||||
- Ensures URLs are processed only if they are explicitly provided.
|
||||
|
||||
"""
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
@@ -13,15 +13,15 @@ class CLIFeeder(Feeder):
|
||||
if type(self.urls) != list or len(self.urls) == 0:
|
||||
raise Exception("CLI Feeder did not receive any URL to process")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
}
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "urls": {
|
||||
# "default": None,
|
||||
# "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
# "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
# },
|
||||
# }
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
for url in self.urls:
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class ConsoleDb(Database):
|
||||
@@ -14,10 +14,6 @@ class ConsoleDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
|
||||
0
src/auto_archiver/modules/csv_db/__init__.py
Normal file
0
src/auto_archiver/modules/csv_db/__init__.py
Normal file
@@ -3,8 +3,8 @@ from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class CSVDb(Database):
|
||||
@@ -18,11 +18,6 @@ class CSVDb(Database):
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("csv_file")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
}
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
0
src/auto_archiver/modules/csv_feeder/__init__.py
Normal file
0
src/auto_archiver/modules/csv_feeder/__init__.py
Normal file
33
src/auto_archiver/modules/csv_feeder/__manifest__.py
Normal file
33
src/auto_archiver/modules/csv_feeder/__manifest__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "CSV Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"files": {
|
||||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
Input files should be formatted with one URL per line",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
"help": "Column number or name to read the URLs from, 0-indexed",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
Reads URLs from CSV files and feeds them into the archiving process.
|
||||
|
||||
### Features
|
||||
- Supports reading URLs from multiple input files, specified as a comma-separated list.
|
||||
- Allows specifying the column number or name to extract URLs from.
|
||||
- Skips header rows if the first value is not a valid URL.
|
||||
- Integrates with the `ArchivingContext` to manage URL feeding.
|
||||
|
||||
### Setup
|
||||
- Input files should be formatted with one URL per line.
|
||||
"""
|
||||
}
|
||||
@@ -1,12 +1,15 @@
|
||||
from loguru import logger
|
||||
import csv
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import url_or_none
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import url_or_none
|
||||
|
||||
class CSVFeeder(Feeder):
|
||||
|
||||
name = "csv_feeder"
|
||||
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
0
src/auto_archiver/modules/gsheet_db/__init__.py
Normal file
0
src/auto_archiver/modules/gsheet_db/__init__.py
Normal file
@@ -4,9 +4,9 @@ from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..utils import GWorksheet
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
@@ -20,10 +20,6 @@ class GsheetsDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
0
src/auto_archiver/modules/gsheet_feeder/__init__.py
Normal file
0
src/auto_archiver/modules/gsheet_feeder/__init__.py
Normal file
40
src/auto_archiver/modules/gsheet_feeder/__manifest__.py
Normal file
40
src/auto_archiver/modules/gsheet_feeder/__manifest__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Features
|
||||
- Validates the sheet structure and filters rows based on input configurations.
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
|
||||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
"""
|
||||
}
|
||||
@@ -14,9 +14,9 @@ from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import Gsheets, GWorksheet
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import Gsheets, GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
@@ -27,26 +27,26 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return dict(
|
||||
Gsheets.configs(),
|
||||
** {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
})
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return dict(
|
||||
# Gsheets.configs(),
|
||||
# ** {
|
||||
# "allow_worksheets": {
|
||||
# "default": set(),
|
||||
# "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# },
|
||||
# "block_worksheets": {
|
||||
# "default": set(),
|
||||
# "help": "(CSV) explicitly block some worksheets from being processed",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# },
|
||||
# "use_sheet_names_in_stored_paths": {
|
||||
# "default": True,
|
||||
# "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
# }
|
||||
# })
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
0
src/auto_archiver/modules/hash_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/hash_enricher/__init__.py
Normal file
27
src/auto_archiver/modules/hash_enricher/__manifest__.py
Normal file
27
src/auto_archiver/modules/hash_enricher/__manifest__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
},
|
||||
"description": """
|
||||
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
||||
|
||||
### Features
|
||||
- Calculates cryptographic hashes (SHA-256 or SHA3-512) for media files stored in `Metadata` objects.
|
||||
- Ensures content authenticity, integrity validation, and duplicate identification.
|
||||
- Efficiently processes large files by reading file bytes in configurable chunk sizes.
|
||||
- Supports dynamic configuration of hash algorithms and chunk sizes.
|
||||
- Updates media metadata with the computed hash value in the format `<algorithm>:<hash>`.
|
||||
|
||||
### Notes
|
||||
- Default hash algorithm is SHA-256, but SHA3-512 is also supported.
|
||||
- Chunk size defaults to 16 MB but can be adjusted based on memory requirements.
|
||||
- Useful for workflows requiring hash-based content validation or deduplication.
|
||||
""",
|
||||
}
|
||||
@@ -10,8 +10,8 @@ making it suitable for handling large files efficiently.
|
||||
import hashlib
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
@@ -45,13 +45,6 @@ class HashEnricher(Enricher):
|
||||
|
||||
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
@@ -8,7 +8,7 @@
|
||||
"retrying",
|
||||
"tqdm",],
|
||||
},
|
||||
"no_setup_required": False,
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"default": None, "help": "API endpoint to use"},
|
||||
@@ -25,5 +25,22 @@
|
||||
"help": "if true, will remove empty values from the json output",
|
||||
},
|
||||
},
|
||||
"description": "",
|
||||
"description": """
|
||||
Archives various types of Instagram content using the Instagrapi API.
|
||||
|
||||
### Features
|
||||
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
|
||||
- Supports advanced configuration options, including:
|
||||
- Full profile download (all posts, stories, highlights, and tagged content).
|
||||
- Limiting the number of posts to fetch for large profiles.
|
||||
- Minimising JSON output to remove empty fields and redundant data.
|
||||
- Provides robust error handling and retries for API calls.
|
||||
- Ensures efficient media scraping, including handling nested or carousel media items.
|
||||
- Adds downloaded media and metadata to the result for further processing.
|
||||
|
||||
### Notes
|
||||
- Requires a valid Instagrapi API token (`access_token`) and API endpoint (`api_endpoint`).
|
||||
- Full-profile downloads can be limited by setting `full_profile_max_posts`.
|
||||
- Designed to fetch content in batches for large profiles, minimising API load.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -45,25 +45,6 @@ class InstagramAPIArchiver(Archiver):
|
||||
self.full_profile = bool(self.full_profile)
|
||||
self.minimize_json_output = bool(self.minimize_json_output)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"default": None, "help": "API endpoint to use"},
|
||||
"full_profile": {
|
||||
"default": False,
|
||||
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
|
||||
},
|
||||
"full_profile_max_posts": {
|
||||
"default": 0,
|
||||
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
|
||||
},
|
||||
"minimize_json_output": {
|
||||
"default": True,
|
||||
"help": "if true, will remove empty values from the json output",
|
||||
},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
@@ -3,10 +3,12 @@
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_archiver:InstagramArchiver",
|
||||
"external_dependencies": {
|
||||
"python": ["instaloader",
|
||||
"loguru",],
|
||||
"python": [
|
||||
"instaloader",
|
||||
"loguru",
|
||||
],
|
||||
},
|
||||
"no_setup_required": False,
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"default": None, "help": "a valid Instagram username"},
|
||||
"password": {
|
||||
|
||||
@@ -45,16 +45,7 @@ class InstagramArchiver(Archiver):
|
||||
except Exception as e2:
|
||||
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"username": {"default": None, "help": "a valid Instagram username"},
|
||||
"password": {"default": None, "help": "the corresponding Instagram account password"},
|
||||
"download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"},
|
||||
"session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"},
|
||||
#TODO: fine-grain
|
||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||
}
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -34,15 +34,6 @@ class InstagramTbotArchiver(Archiver):
|
||||
self.assert_valid_string("api_hash")
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
1. makes a copy of session_file that is removed in cleanup
|
||||
|
||||
0
src/auto_archiver/modules/meta_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/meta_enricher/__init__.py
Normal file
22
src/auto_archiver/modules/meta_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/meta_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "Archive Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"description": """
|
||||
Adds metadata about archive operations, including file sizes and archive duration.
|
||||
To be included at the end of all enrichments.
|
||||
|
||||
### Features
|
||||
- Calculates the total size of all archived media files, storing the result in human-readable and byte formats.
|
||||
- Computes the duration of the archival process, storing the elapsed time in seconds.
|
||||
- Ensures all enrichments are performed only if the `Metadata` object contains valid data.
|
||||
- Adds detailed metadata to provide insights into file sizes and archival performance.
|
||||
|
||||
### Notes
|
||||
- Skips enrichment if no media or metadata is available in the `Metadata` object.
|
||||
- File sizes are calculated using the `os.stat` module, ensuring accurate byte-level reporting.
|
||||
""",
|
||||
}
|
||||
@@ -2,8 +2,8 @@ import datetime
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class MetaEnricher(Enricher):
|
||||
@@ -17,10 +17,6 @@ class MetaEnricher(Enricher):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
if to_enrich.is_empty():
|
||||
@@ -28,7 +24,7 @@ class MetaEnricher(Enricher):
|
||||
return
|
||||
|
||||
logger.debug(f"calculating archive metadata information for {url=}")
|
||||
|
||||
|
||||
self.enrich_file_sizes(to_enrich)
|
||||
self.enrich_archive_duration(to_enrich)
|
||||
|
||||
@@ -40,10 +36,10 @@ class MetaEnricher(Enricher):
|
||||
media.set("bytes", file_stats.st_size)
|
||||
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
||||
total_size += file_stats.st_size
|
||||
|
||||
|
||||
to_enrich.set("total_bytes", total_size)
|
||||
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
||||
|
||||
|
||||
|
||||
def human_readable_bytes(self, size: int) -> str:
|
||||
# receives number of bytes and returns human readable size
|
||||
22
src/auto_archiver/modules/metadata_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/metadata_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "Media Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": ["exiftool"]
|
||||
|
||||
},
|
||||
"description": """
|
||||
Extracts metadata information from files using ExifTool.
|
||||
|
||||
### Features
|
||||
- Uses ExifTool to extract detailed metadata from media files.
|
||||
- Processes file-specific data like camera settings, geolocation, timestamps, and other embedded metadata.
|
||||
- Adds extracted metadata to the corresponding `Media` object within the `Metadata`.
|
||||
|
||||
### Notes
|
||||
- Requires ExifTool to be installed and accessible via the system's PATH.
|
||||
- Skips enrichment for files where metadata extraction fails.
|
||||
"""
|
||||
}
|
||||
@@ -2,8 +2,8 @@ import subprocess
|
||||
import traceback
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class MetadataEnricher(Enricher):
|
||||
@@ -16,9 +16,6 @@ class MetadataEnricher(Enricher):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
21
src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py
Normal file
21
src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "PDQ Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
|
||||
},
|
||||
"description": """
|
||||
PDQ Hash Enricher for generating perceptual hashes of media files.
|
||||
|
||||
### Features
|
||||
- Calculates perceptual hashes for image files using the PDQ hashing algorithm.
|
||||
- Enables detection of duplicate or near-duplicate visual content.
|
||||
- Processes images stored in `Metadata` objects, adding computed hashes to the corresponding `Media` entries.
|
||||
- Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats).
|
||||
|
||||
### Notes
|
||||
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
||||
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
||||
"""
|
||||
}
|
||||
@@ -16,8 +16,8 @@ import numpy as np
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class PdqHashEnricher(Enricher):
|
||||
@@ -31,10 +31,6 @@ class PdqHashEnricher(Enricher):
|
||||
# Without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating perceptual hashes for {url=}")
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "Screenshot Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
"bin": ["chromedriver"]
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
### Features
|
||||
- Takes screenshots of web pages, with configurable width, height, and timeout settings.
|
||||
- Optionally saves pages as PDFs, with additional configuration for PDF printing options.
|
||||
- Bypasses URLs detected as authentication walls.
|
||||
- Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
"""
|
||||
}
|
||||
@@ -5,24 +5,30 @@ import base64
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver, UrlUtil, random_str
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.utils import Webdriver, UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
}
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
# TODO?
|
||||
|
||||
|
||||
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "width": {"default": 1280, "help": "width of the screenshots"},
|
||||
# "height": {"default": 720, "help": "height of the screenshots"},
|
||||
# "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
# "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
# "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
# "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
# "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
# }
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
0
src/auto_archiver/modules/ssl_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/ssl_enricher/__init__.py
Normal file
22
src/auto_archiver/modules/ssl_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/ssl_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "SSL Certificate Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
### Features
|
||||
- Fetches SSL certificates for domains using the HTTPS protocol.
|
||||
- Stores certificates in PEM format and adds them as media to the metadata.
|
||||
- Skips enrichment if no media has been archived, based on the `skip_when_nothing_archived` configuration.
|
||||
|
||||
### Notes
|
||||
- Requires the target URL to use the HTTPS scheme; other schemes are not supported.
|
||||
"""
|
||||
}
|
||||
@@ -3,8 +3,8 @@ from slugify import slugify
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
|
||||
|
||||
class SSLEnricher(Enricher):
|
||||
@@ -15,13 +15,7 @@ class SSLEnricher(Enricher):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self. skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
}
|
||||
self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not to_enrich.media and self.skip_when_nothing_archived: return
|
||||
@@ -16,9 +16,6 @@ class TelegramArchiver(Archiver):
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
# TODO
|
||||
#"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -23,20 +23,6 @@ class TelethonArchiver(Archiver):
|
||||
self.assert_valid_string("api_id")
|
||||
self.assert_valid_string("api_hash")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
}
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
|
||||
27
src/auto_archiver/modules/thumbnail_enricher/__manifest__.py
Normal file
27
src/auto_archiver/modules/thumbnail_enricher/__manifest__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "Thumbnail Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "ffmpeg-python"],
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
### Features
|
||||
- Processes video files and generates evenly distributed thumbnails.
|
||||
- Calculates the number of thumbnails based on video duration, `thumbnails_per_minute`, and `max_thumbnails`.
|
||||
- Distributes thumbnails equally across the video's duration and stores them as media objects.
|
||||
- Adds metadata for each thumbnail, including timestamps and IDs.
|
||||
|
||||
### Notes
|
||||
- Requires `ffmpeg` to be installed and accessible via the system's PATH.
|
||||
- Handles videos without pre-existing duration metadata by probing with `ffmpeg`.
|
||||
- Skips enrichment for non-video media files.
|
||||
"""
|
||||
}
|
||||
@@ -9,9 +9,9 @@ and identify important moments without watching the entire video.
|
||||
import ffmpeg, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from ..utils.misc import random_str
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
@@ -25,13 +25,6 @@ class ThumbnailEnricher(Enricher):
|
||||
super().__init__(config)
|
||||
self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
|
||||
self.max_thumbnails = int(self.max_thumbnails)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
"""
|
||||
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "Timestamping Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"slugify",
|
||||
"tsp_client",
|
||||
"asn1crypto",
|
||||
"certvalidator",
|
||||
"certifi"
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
"http://tss.accv.es:8318/tsa"
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
Generates RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files.
|
||||
|
||||
### Features
|
||||
- Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes.
|
||||
- Aggregates file hashes into a text file and timestamps the concatenated data.
|
||||
- Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy.
|
||||
- Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store.
|
||||
|
||||
### Notes
|
||||
- Should be run after the `hash_enricher` to ensure file hashes are available.
|
||||
- Requires internet access to interact with the configured TSAs.
|
||||
"""
|
||||
}
|
||||
@@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
|
||||
from asn1crypto import pem
|
||||
import certifi
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext, Media
|
||||
from ..archivers import Archiver
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.archivers import Archiver
|
||||
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
@@ -26,36 +26,36 @@ class TimestampingEnricher(Enricher):
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
|
||||
# [Adobe: European Union Trusted Lists].
|
||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
|
||||
# [Windows Cert Store]
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
|
||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
}
|
||||
}
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "tsa_urls": {
|
||||
# "default": [
|
||||
# # [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
# "http://timestamp.digicert.com",
|
||||
# "http://timestamp.identrust.com",
|
||||
# # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
#
|
||||
# # [Adobe: European Union Trusted Lists].
|
||||
# # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
#
|
||||
# # [Windows Cert Store]
|
||||
# "http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
#
|
||||
# # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# # "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# # "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
# "http://tss.accv.es:8318/tsa",
|
||||
# ],
|
||||
# "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# }
|
||||
# }
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
@@ -12,7 +12,8 @@
|
||||
},
|
||||
"configs": {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
|
||||
@@ -34,17 +34,6 @@ class TwitterApiArchiver(Archiver):
|
||||
access_token=self.access_token, access_secret=self.access_secret))
|
||||
assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||
}
|
||||
|
||||
@property # getter .api_client
|
||||
def api_client(self) -> str:
|
||||
return self.apis[self.api_index]
|
||||
|
||||
@@ -19,14 +19,6 @@ class VkArchiver(Archiver):
|
||||
self.assert_valid_string("password")
|
||||
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
||||
|
||||
@staticmethod
def configs() -> dict:
    """Return the configuration options accepted by the VK archiver.

    Returns:
        A dict mapping each option name to a dict holding its ``default``
        value and a ``help`` text.
    """
    return {
        "username": {"default": None, "help": "valid VKontakte username"},
        "password": {"default": None, "help": "valid VKontakte password"},
        # session_file is handed to VkScraper(session_file=...); it is a file path, not a password
        "session_file": {"default": "secrets/vk_config.v2.json", "help": "path to the VKontakte session file"},
    }
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
0
src/auto_archiver/modules/wacz_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/wacz_enricher/__init__.py
Normal file
39
src/auto_archiver/modules/wacz_enricher/__manifest__.py
Normal file
39
src/auto_archiver/modules/wacz_enricher/__manifest__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"jsonlines",
|
||||
"warcio"
|
||||
],
|
||||
# TODO?
|
||||
"bin": [
|
||||
"docker"
|
||||
]
|
||||
},
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
|
||||
### Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
### Notes
|
||||
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
|
||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||
"""
|
||||
}
|
||||
@@ -5,10 +5,10 @@ from zipfile import ZipFile
|
||||
from loguru import logger
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.utils import UrlUtil, random_str
|
||||
|
||||
|
||||
class WaczArchiverEnricher(Enricher, Archiver):
|
||||
@@ -24,19 +24,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
||||
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
||||
29
src/auto_archiver/modules/wayback_enricher/__manifest__.py
Normal file
29
src/auto_archiver/modules/wayback_enricher/__manifest__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
|
||||
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
|
||||
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
|
||||
},
|
||||
"description": """
|
||||
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
|
||||
|
||||
### Features
|
||||
- Archives URLs using the Internet Archive's Wayback Machine API.
|
||||
- Supports conditional archiving based on the existence of prior archives within a specified time range.
|
||||
- Provides proxies for HTTP and HTTPS requests.
|
||||
- Fetches and confirms the archive URL or provides a job ID for later status checks.
|
||||
|
||||
### Notes
|
||||
- Requires a valid Wayback Machine API key and secret.
|
||||
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
|
||||
"""
|
||||
}
|
||||
@@ -2,10 +2,10 @@ import json
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
"""
|
||||
@@ -21,17 +21,6 @@ class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
# the API key must be a non-empty string (the API secret is validated separately)
assert type(self.key) == str and len(self.key) > 0, "please provide a value for the wayback_enricher API key"
|
||||
assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
|
||||
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
|
||||
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
# this new Metadata object is required to avoid duplication
|
||||
result = Metadata()
|
||||
30
src/auto_archiver/modules/whisper_enricher/__manifest__.py
Normal file
30
src/auto_archiver/modules/whisper_enricher/__manifest__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "Whisper Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
|
||||
},
|
||||
"description": """
|
||||
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
||||
|
||||
### Features
|
||||
- Submits audio or video files to a Whisper API deployment for processing.
|
||||
- Supports operations such as transcription, translation, and language detection.
|
||||
- Optionally generates SRT subtitle files for video content.
|
||||
- Integrates with S3-compatible storage systems to make files publicly accessible for processing.
|
||||
- Handles job submission, status checking, artifact retrieval, and cleanup.
|
||||
|
||||
### Notes
|
||||
- Requires a Whisper API endpoint and API key for authentication.
|
||||
- Only compatible with S3-compatible storage systems for media file accessibility.
|
||||
- Handles multiple jobs and retries for failed or incomplete processing.
|
||||
"""
|
||||
}
|
||||
@@ -2,9 +2,9 @@ import traceback
|
||||
import requests, time
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..storages import S3Storage
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.storages import S3Storage
|
||||
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
@@ -22,17 +22,6 @@ class WhisperEnricher(Enricher):
|
||||
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not self._get_s3_storage():
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
Reference in New Issue
Block a user