from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Set, Union

from media import Media


@dataclass
class Metadata:
    """Container for the key/value metadata and media collected for one item.

    Setter-style methods return ``self`` so calls can be chained.
    """

    # free-form status string; replaced by the right-hand status on merge
    status: str = ""
    # arbitrary key/value store accessed through set()/get()
    metadata: Dict[str, Any] = field(default_factory=dict)
    # keys that are not to be saved in DBs
    tmp_keys: Set[str] = field(default_factory=set)
    # media objects attached through add_media()
    media: List[Media] = field(default_factory=list)
    # OR-ed with the right-hand flag on merge
    rearchivable: bool = False

    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """Merge two Metadata instances, overwriting according to ``overwrite_left``.

        With ``overwrite_left=True`` the values of ``right`` are merged into
        ``self``: ``status`` is replaced, ``rearchivable`` is OR-ed, ``media``
        is extended, and each metadata key is either overwritten (scalars and
        new keys) or combined (dict/set via ``|``, list via ``+``).

        With ``overwrite_left=False`` the roles are inverted: ``self`` is
        merged into ``right``, so it is ``right`` that is mutated and returned.

        Returns:
            The instance that received the merged values.

        Raises:
            AssertionError: if a key exists on both sides with values of
                different types.
        """
        if not overwrite_left:
            # invert and do the same logic
            return right.merge(self)

        self.status = right.status
        self.rearchivable |= right.rearchivable
        # NOTE(review): right.tmp_keys is not propagated here, so a key that
        # was temporary on `right` is no longer flagged after the merge --
        # confirm whether that is intended.
        for k, v in right.metadata.items():
            assert k not in self.metadata or type(v) is type(self.get(k)), (
                f"cannot merge key {k!r}: conflicting value types"
            )
            # exact type checks on purpose: only plain dict/list/set are combined
            if type(v) not in [dict, list, set] or k not in self.metadata:
                self.set(k, v)
            elif type(v) in [dict, set]:
                # key conflict on a dict/set: union, right-hand entries win
                self.set(k, self.get(k) | v)
            else:
                # key conflict on a list: concatenate, left-hand items first
                self.set(k, self.get(k) + v)
        self.media.extend(right.media)
        return self

    def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
        """Store ``val`` under ``key``; flag the key as temporary if ``is_tmp``.

        Returns:
            ``self``, to allow chaining.
        """
        self.metadata[key] = val
        if is_tmp:
            self.tmp_keys.add(key)
        return self

    def get(self, key: str, default: Any = None, create_if_missing=False) -> Any:
        """Return the value stored under ``key``, or ``default`` if absent.

        When ``create_if_missing`` is true and the key is absent, ``default``
        is first stored under ``key`` and then returned.
        """
        if create_if_missing and key not in self.metadata:
            self.metadata[key] = default
        return self.metadata.get(key, default)

    # custom getter/setters

    def set_url(self, url: str) -> Metadata:
        """Store ``url`` under the ``"url"`` key; it must be a non-empty string."""
        assert isinstance(url, str) and len(url) > 0, "invalid URL"
        return self.set("url", url)

    def get_url(self) -> str:
        """Return the stored ``"url"`` value; it must be a non-empty string."""
        url = self.get("url")
        assert isinstance(url, str) and len(url) > 0, "invalid URL"
        return url

    def set_content(self, content: str) -> Metadata:
        """Store the main textual content/information from a social media
        post, webpage, ... under the ``"content"`` key."""
        return self.set("content", content)

    def set_title(self, title: str) -> Metadata:
        """Store ``title`` under the ``"title"`` key."""
        return self.set("title", title)

    def set_timestamp(self, timestamp: datetime) -> Metadata:
        """Store ``timestamp`` under the ``"timestamp"`` key.

        Raises:
            AssertionError: if ``timestamp`` is not a ``datetime`` instance.
        """
        assert isinstance(timestamp, datetime), "set_timestamp expects a datetime instance"
        return self.set("timestamp", timestamp)

    def add_media(self, media: Media) -> Metadata:
        """Append ``media`` to the media list.

        Returns:
            ``self``, to allow chaining like the other setters.
        """
        # list.append returns None, so return self explicitly
        self.media.append(media)
        return self

    # TODO: add an as_json() that converts all metadata and data into JSON;
    # json.dumps(self.metadata) is not enough because datetime is not serializable

    def cleanup(self) -> Metadata:
        """Currently a no-op that returns ``self``.

        TODO: refactor so it returns a JSON with all intended properties,
        except tmp_keys. Removing the temporary keys from ``metadata`` here
        leads to errors if a database needs tmp_keys after they are removed.
        """
        return self