mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Archiving Context refactor complete
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ArchivingContext:
|
||||
"""
|
||||
@@ -7,11 +9,15 @@ class ArchivingContext:
|
||||
ArchivingContext.set(key, value)
|
||||
and
|
||||
ArchivingContext.get(key, default)
|
||||
|
||||
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
|
||||
reset(full_reset=True) will recreate everything including the keep_on_reset status
|
||||
"""
|
||||
_instance = None
|
||||
|
||||
def __init__(self):
|
||||
self.configs = {}
|
||||
self.keep_on_reset = set()
|
||||
|
||||
@staticmethod
|
||||
def get_instance():
|
||||
@@ -20,13 +26,22 @@ class ArchivingContext:
|
||||
return ArchivingContext._instance
|
||||
|
||||
@staticmethod
|
||||
def set(key, value):
|
||||
ArchivingContext.get_instance().configs[key] = value
|
||||
def set(key, value, keep_on_reset: bool = False):
    """
    Stores `value` under `key` in the shared ArchivingContext instance.

    When keep_on_reset is True the key is also added to the instance's
    keep_on_reset set.
    """
    # a routine write is not an error condition: trace it at debug level
    logger.debug(f"SET [{key}]={value}")
    ac = ArchivingContext.get_instance()
    ac.configs[key] = value
    if keep_on_reset:
        ac.keep_on_reset.add(key)
|
||||
|
||||
@staticmethod
def get(key: str, default=None):
    """Looks up `key` in the shared instance's configs, returning `default` when the key is absent."""
    context = ArchivingContext.get_instance()
    return context.configs.get(key, default)
|
||||
|
||||
@staticmethod
def reset(full_reset: bool = False):
    """
    Drops every config from the shared instance except keys present in keep_on_reset.

    With full_reset=True the keep_on_reset set is emptied first, so no config survives.
    """
    context = ArchivingContext.get_instance()
    if full_reset:
        context.keep_on_reset = set()
    # rebuild configs from scratch, carrying over only the protected keys
    survivors = {}
    for name, stored in context.configs.items():
        if name in context.keep_on_reset:
            survivors[name] = stored
    context.configs = survivors
|
||||
|
||||
# ---- custom getters/setters for widely used context values
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -3,19 +3,43 @@ from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Any
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Media:
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
properties: dict = field(default_factory=dict)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=True))
|
||||
|
||||
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
    """
    Stores this media into the provided/available storages [Storage].

    The process is repeated for Media found in this media's properties, either
    directly as a property value or as items of a list value. For now it only
    goes down 1 level but it's easy to make it recursive if needed.

    override_storages: storages to use instead of the ones kept in ArchivingContext under "storages"
    url: handed unchanged to every storage's store() call
    """
    storages = override_storages or ArchivingContext.get("storages")
    # ArchivingContext.get returns None when "storages" is missing, and len(None)
    # raises TypeError; a truthiness check covers both None and an empty list
    if not storages:
        logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
        return

    for s in storages:
        s.store(self, url)
        # Media can be inside media properties, examples include transformations on original media
        for prop in self.properties.values():
            if isinstance(prop, Media):
                s.store(prop, url)
            if isinstance(prop, list):
                for prop_media in prop:
                    if isinstance(prop_media, Media):
                        s.store(prop_media, url)
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
@@ -44,10 +68,3 @@ class Media:
|
||||
|
||||
def is_audio(self) -> bool:
    """Tells whether this media's mimetype starts with "audio"."""
    mime = self.mimetype
    return mime.startswith("audio")
|
||||
|
||||
def store(self):
|
||||
"""
|
||||
either stores this media entry and all its media descendants
|
||||
or returns if that process is already completed
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -8,9 +8,10 @@ import datetime
|
||||
from urllib.parse import urlparse
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from .media import Media
|
||||
from .context import ArchivingContext
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Metadata:
|
||||
status: str = "no archiver"
|
||||
@@ -23,7 +24,6 @@ class Metadata:
|
||||
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
|
||||
# tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
|
||||
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||
@@ -46,6 +46,12 @@ class Metadata:
|
||||
return right.merge(self)
|
||||
return self
|
||||
|
||||
def store(self: Metadata, override_storages: List = None):
    """
    Calls .store on every entry of self.media, handing each one the same storages [Storage].

    Storages are override_storages when given, otherwise whatever ArchivingContext holds under "storages".
    """
    if override_storages:
        storages = override_storages
    else:
        storages = ArchivingContext.get("storages")
    # NOTE(review): no url is forwarded here, so each media's store() uses its own default url - confirm this is intended
    for item in self.media:
        item.store(override_storages=storages)
|
||||
|
||||
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
|
||||
# if not self.metadata: self.metadata = {}
|
||||
self.metadata[key] = val
|
||||
@@ -144,4 +150,3 @@ class Metadata:
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
@@ -25,7 +25,7 @@ class ArchivingOrchestrator:
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
ArchivingContext.set("storages", self.storages)
|
||||
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
for a in self.archivers: a.setup()
|
||||
|
||||
@@ -35,6 +35,7 @@ class ArchivingOrchestrator:
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
try:
|
||||
ArchivingContext.reset()
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||
ArchivingContext.set_tmp_dir(tmp_dir)
|
||||
return self.archive(item)
|
||||
@@ -108,22 +109,12 @@ class ArchivingOrchestrator:
|
||||
|
||||
# 5 - store media
|
||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||
for s in self.storages:
|
||||
for m in result.media:
|
||||
s.store(m, result) # modifies media
|
||||
# Media can be inside media properties, examples include transformations on original media
|
||||
for prop in m.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
s.store(prop, result)
|
||||
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
|
||||
for prop_media in prop:
|
||||
s.store(prop_media, result)
|
||||
result.store()
|
||||
|
||||
# 6 - format and store formatted if needed
|
||||
# enrichers typically need access to already stored URLs etc
|
||||
if (final_media := self.formatter.format(result)):
|
||||
for s in self.storages:
|
||||
s.store(final_media, result)
|
||||
final_media.store()
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
|
||||
Reference in New Issue
Block a user