mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 12:18:30 +03:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cc66ee3fd4 | ||
|
|
b3b727b005 | ||
|
|
ee37b20e6c | ||
|
|
a184bf7b97 | ||
|
|
e535f44a88 | ||
|
|
0f28bf0e35 | ||
|
|
18a8636552 | ||
|
|
81be65c828 | ||
|
|
0a91863212 | ||
|
|
3ad8349e3f | ||
|
|
2768225cd1 | ||
|
|
3e44b9b577 | ||
|
|
1a5797d0f8 | ||
|
|
768b8fce9f | ||
|
|
613b1f1e50 |
@@ -5,7 +5,7 @@ def main():
|
||||
config = Config()
|
||||
config.parse()
|
||||
orchestrator = ArchivingOrchestrator(config)
|
||||
orchestrator.feed()
|
||||
for r in orchestrator.feed(): pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -13,6 +13,7 @@ from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
from ..utils import update_nested_dict
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -38,7 +39,7 @@ class Config:
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs:str={}):
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
"""
|
||||
if yaml_config_filename is provided, the --config argument is ignored,
|
||||
useful for library usage when the config values are preloaded
|
||||
@@ -81,7 +82,7 @@ class Config:
|
||||
|
||||
# 2. read YAML config file (or use provided value)
|
||||
self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
self.yaml_config.update(overwrite_configs) # optional override programmatically
|
||||
update_nested_dict(self.yaml_config, overwrite_configs)
|
||||
|
||||
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
self.config = defaultdict(dict)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Any
|
||||
from typing import Any, List
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from ast import List, Set
|
||||
from typing import Any, Union, Dict
|
||||
from typing import Any, List, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import datetime
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Union
|
||||
from typing import Generator, Union, List
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
@@ -10,7 +9,6 @@ from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from ..enrichers import Enricher
|
||||
from ..databases import Database
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
|
||||
import tempfile, traceback
|
||||
@@ -29,9 +27,9 @@ class ArchivingOrchestrator:
|
||||
|
||||
for a in self.archivers: a.setup()
|
||||
|
||||
def feed(self) -> None:
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for item in self.feeder:
|
||||
self.feed_item(item)
|
||||
yield self.feed_item(item)
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
try:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from .database import Database
|
||||
from .gsheet_db import GsheetsDb
|
||||
from .console_db import ConsoleDb
|
||||
from .csv_db import CSVDb
|
||||
from .csv_db import CSVDb
|
||||
from .api_db import AAApiDb
|
||||
41
src/auto_archiver/databases/api_db.py
Normal file
41
src/auto_archiver/databases/api_db.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
|
||||
"""
|
||||
Connects to auto-archiver-api instance
|
||||
"""
|
||||
name = "auto_archiver_api_db"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
self.assert_valid_string("api_secret")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_secret": {"default": None, "help": "API authentication secret"},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
}
|
||||
|
||||
def done(self, item: Metadata) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.info(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
|
||||
|
||||
if response.status_code == 200:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
@@ -64,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||
folder = slugify(gw.get_cell(row, 'folder').strip())
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||
|
||||
@@ -40,3 +40,12 @@ class DateTimeEncoder(json.JSONEncoder):
|
||||
|
||||
def dump_payload(p):
|
||||
return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
|
||||
|
||||
def update_nested_dict(dictionary, update_dict):
|
||||
# takes 2 dicts and recursively overwrites the first with the second, only on the changed values
|
||||
for key, value in update_dict.items():
|
||||
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
|
||||
update_nested_dict(dictionary[key], value)
|
||||
else:
|
||||
dictionary[key] = value
|
||||
|
||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
||||
_MINOR = "5"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "14"
|
||||
_PATCH = "23"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user