mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a184bf7b97 | ||
|
|
e535f44a88 | ||
|
|
0f28bf0e35 | ||
|
|
18a8636552 | ||
|
|
81be65c828 | ||
|
|
0a91863212 | ||
|
|
3ad8349e3f | ||
|
|
2768225cd1 | ||
|
|
3e44b9b577 | ||
|
|
1a5797d0f8 | ||
|
|
768b8fce9f | ||
|
|
613b1f1e50 | ||
|
|
919c37bfb6 | ||
|
|
a655b3c987 | ||
|
|
d645b840ee |
2
.github/workflows/docker-publish.yaml
vendored
2
.github/workflows/docker-publish.yaml
vendored
@@ -9,7 +9,7 @@ on:
|
|||||||
release:
|
release:
|
||||||
types: [published]
|
types: [published]
|
||||||
push:
|
push:
|
||||||
branches: [ "main" ]
|
# branches: [ "main" ]
|
||||||
tags: [ "v*.*.*" ]
|
tags: [ "v*.*.*" ]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
|
|||||||
2
.github/workflows/python-publish.yaml
vendored
2
.github/workflows/python-publish.yaml
vendored
@@ -12,7 +12,7 @@ on:
|
|||||||
release:
|
release:
|
||||||
types: [published]
|
types: [published]
|
||||||
push:
|
push:
|
||||||
branches: [ "main" ]
|
# branches: [ "main" ]
|
||||||
tags: [ "v*.*.*" ]
|
tags: [ "v*.*.*" ]
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ def main():
|
|||||||
config = Config()
|
config = Config()
|
||||||
config.parse()
|
config.parse()
|
||||||
orchestrator = ArchivingOrchestrator(config)
|
orchestrator = ArchivingOrchestrator(config)
|
||||||
orchestrator.feed()
|
for r in orchestrator.feed(): pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ from ..formatters import Formatter
|
|||||||
from ..storages import Storage
|
from ..storages import Storage
|
||||||
from ..enrichers import Enricher
|
from ..enrichers import Enricher
|
||||||
from . import Step
|
from . import Step
|
||||||
|
from ..utils import update_nested_dict
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -38,7 +39,7 @@ class Config:
|
|||||||
self.cli_ops = {}
|
self.cli_ops = {}
|
||||||
self.config = {}
|
self.config = {}
|
||||||
|
|
||||||
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs:str={}):
|
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||||
"""
|
"""
|
||||||
if yaml_config_filename is provided, the --config argument is ignored,
|
if yaml_config_filename is provided, the --config argument is ignored,
|
||||||
useful for library usage when the config values are preloaded
|
useful for library usage when the config values are preloaded
|
||||||
@@ -81,7 +82,7 @@ class Config:
|
|||||||
|
|
||||||
# 2. read YAML config file (or use provided value)
|
# 2. read YAML config file (or use provided value)
|
||||||
self.yaml_config = self.read_yaml(yaml_config_filename)
|
self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||||
self.yaml_config.update(overwrite_configs) # optional override programmatically
|
update_nested_dict(self.yaml_config, overwrite_configs)
|
||||||
|
|
||||||
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||||
self.config = defaultdict(dict)
|
self.config = defaultdict(dict)
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from ast import List
|
from typing import Any, List
|
||||||
from typing import Any
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from ast import List, Set
|
from typing import Any, List, Union, Dict
|
||||||
from typing import Any, Union, Dict
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
import datetime
|
import datetime
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from ast import List
|
from typing import Generator, Union, List
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from .context import ArchivingContext
|
from .context import ArchivingContext
|
||||||
|
|
||||||
@@ -10,7 +9,6 @@ from ..formatters import Formatter
|
|||||||
from ..storages import Storage
|
from ..storages import Storage
|
||||||
from ..enrichers import Enricher
|
from ..enrichers import Enricher
|
||||||
from ..databases import Database
|
from ..databases import Database
|
||||||
from .media import Media
|
|
||||||
from .metadata import Metadata
|
from .metadata import Metadata
|
||||||
|
|
||||||
import tempfile, traceback
|
import tempfile, traceback
|
||||||
@@ -29,9 +27,9 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
for a in self.archivers: a.setup()
|
for a in self.archivers: a.setup()
|
||||||
|
|
||||||
def feed(self) -> None:
|
def feed(self) -> Generator[Metadata]:
|
||||||
for item in self.feeder:
|
for item in self.feeder:
|
||||||
self.feed_item(item)
|
yield self.feed_item(item)
|
||||||
|
|
||||||
def feed_item(self, item: Metadata) -> Metadata:
|
def feed_item(self, item: Metadata) -> Metadata:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ class Step(ABC):
|
|||||||
|
|
||||||
def init(name: str, config: dict, child: Type[Step]) -> Step:
|
def init(name: str, config: dict, child: Type[Step]) -> Step:
|
||||||
"""
|
"""
|
||||||
looks into direct subclasses of child for name and returns such ab object
|
looks into direct subclasses of child for name and returns such an object
|
||||||
TODO: cannot find subclasses of child.subclasses
|
TODO: cannot find subclasses of child.subclasses
|
||||||
"""
|
"""
|
||||||
for sub in child.__subclasses__():
|
for sub in child.__subclasses__():
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from .database import Database
|
from .database import Database
|
||||||
from .gsheet_db import GsheetsDb
|
from .gsheet_db import GsheetsDb
|
||||||
from .console_db import ConsoleDb
|
from .console_db import ConsoleDb
|
||||||
from .csv_db import CSVDb
|
from .csv_db import CSVDb
|
||||||
|
from .api_db import AAApiDb
|
||||||
41
src/auto_archiver/databases/api_db.py
Normal file
41
src/auto_archiver/databases/api_db.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
import requests, os
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from . import Database
|
||||||
|
from ..core import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class AAApiDb(Database):
    """
    Connects to auto-archiver-api instance

    Every item handed to `done` is POSTed as JSON to
    `<api_endpoint>/submit-archive`, authenticating with HTTP basic auth
    where `api_secret` is sent as the password.
    """
    name = "auto_archiver_api_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        # both settings are required to reach and authenticate against the API
        # NOTE(review): assert_valid_string is inherited; presumably it rejects
        # missing/empty values - confirm against its definition
        self.assert_valid_string("api_endpoint")
        self.assert_valid_string("api_secret")

    @staticmethod
    def configs() -> dict:
        """Return the options of this step, each with its default and help text."""
        return {
            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
            "api_secret": {"default": None, "help": "API authentication secret"},
            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
            "author_id": {"default": None, "help": "which email to assign as author"},
            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
            # a CLI value arrives as a comma-separated string and is turned into a set
            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
        }

    def done(self, item: Metadata) -> None:
        """archival result ready - should be saved to DB"""
        logger.info(f"saving archive of {item.get_url()} to the AA API.")

        # tags may be a list (default) or a set (CLI), so normalise to a list for JSON
        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
        # build the URL with plain string handling: os.path.join is meant for
        # filesystem paths and would insert a backslash on Windows
        url = f"{self.api_endpoint.rstrip('/')}/submit-archive"
        response = requests.post(url, json=payload, auth=("abc", self.api_secret))

        body = self._response_body(response)
        if response.status_code == 200:
            logger.success(f"AA API: {body}")
        else:
            logger.error(f"AA API FAIL ({response.status_code}): {body}")

    @staticmethod
    def _response_body(response):
        """Return the decoded JSON body of `response`, or its raw text when the body is not JSON."""
        try:
            return response.json()
        except ValueError:
            # e.g. an HTML error page from a proxy: still report what came back
            return response.text
|
||||||
@@ -39,7 +39,7 @@ class GsheetsFeeder(Gsheets, Feeder):
|
|||||||
})
|
})
|
||||||
|
|
||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
sh = self.gsheets_client.open(self.sheet)
|
sh = self.open_sheet()
|
||||||
for ii, wks in enumerate(sh.worksheets()):
|
for ii, wks in enumerate(sh.worksheets()):
|
||||||
if not self.should_process_sheet(wks.title):
|
if not self.should_process_sheet(wks.title):
|
||||||
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
|
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
|
||||||
@@ -64,7 +64,7 @@ class GsheetsFeeder(Gsheets, Feeder):
|
|||||||
# All checks done - archival process starts here
|
# All checks done - archival process starts here
|
||||||
m = Metadata().set_url(url)
|
m = Metadata().set_url(url)
|
||||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||||
folder = slugify(gw.get_cell(row, 'folder').strip())
|
folder = slugify(gw.get_cell_or_default(row, 'folder').strip())
|
||||||
if len(folder):
|
if len(folder):
|
||||||
if self.use_sheet_names_in_stored_paths:
|
if self.use_sheet_names_in_stored_paths:
|
||||||
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||||
|
|||||||
@@ -10,16 +10,17 @@ class Gsheets(Step):
|
|||||||
# without this STEP.__init__ is not called
|
# without this STEP.__init__ is not called
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||||
#TODO: config should be responsible for conversions
|
# TODO: config should be responsible for conversions
|
||||||
try: self.header = int(self.header)
|
try: self.header = int(self.header)
|
||||||
except: pass
|
except: pass
|
||||||
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
|
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
|
||||||
assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
|
assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||||
|
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
|
||||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||||
"columns": {
|
"columns": {
|
||||||
@@ -41,4 +42,10 @@ class Gsheets(Step):
|
|||||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def open_sheet(self):
    """
    Open the configured spreadsheet through the gspread client.

    The sheet name (`self.sheet`) is used when it is set; otherwise the
    spreadsheet is opened by its key (`self.sheet_id`).
    """
    if not self.sheet:
        # no sheet name configured: fall back to the sheet id
        return self.gsheets_client.open_by_key(self.sheet_id)
    return self.gsheets_client.open(self.sheet)
|
||||||
|
|||||||
@@ -40,3 +40,12 @@ class DateTimeEncoder(json.JSONEncoder):
|
|||||||
|
|
||||||
def dump_payload(p):
|
def dump_payload(p):
|
||||||
return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||||
|
|
||||||
|
|
||||||
|
def update_nested_dict(dictionary, update_dict):
    """
    Recursively merge `update_dict` into `dictionary`, in place.

    Only the keys present in `update_dict` are touched: nested dicts are
    merged key by key, any other value overwrites the existing one.
    Nested dicts coming from `update_dict` are copied (as plain dicts)
    rather than stored by reference, so later merges into `dictionary`
    never modify the caller's `update_dict`.

    A `None` or empty `update_dict` leaves `dictionary` unchanged.
    """
    if not update_dict:
        return
    for key, value in update_dict.items():
        if isinstance(value, dict):
            target = dictionary.get(key)
            if not isinstance(target, dict):
                # missing key or non-dict value: start from a fresh dict so the
                # nested dict of update_dict is not aliased into dictionary
                target = dictionary[key] = {}
            update_nested_dict(target, value)
        else:
            dictionary[key] = value
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "5"
|
_MINOR = "5"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "13"
|
_PATCH = "20"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user