cleanup + rearchivable logic

This commit is contained in:
msramalho
2023-01-26 23:01:34 +00:00
parent 9dd8afed8c
commit 2508bb8a1b
8 changed files with 70 additions and 61 deletions

View File

@@ -23,10 +23,15 @@ class Archiver(Step):
# used when archivers need to login or do other one-time setup # used when archivers need to login or do other one-time setup
pass pass
def clean_url(self, url: str) -> str: def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters # used to clean unnecessary URL parameters OR unfurl redirect links
return url return url
def is_rearchivable(self, url: str) -> bool:
# Hook that lets an archiver signal that it does not make sense to archive
# a piece of content again once it has been archived.
# The base implementation does not inspect `url` and returns True, i.e.
# rearchiving is the default; archivers can override this to return False.
return True
def _guess_file_type(self, path: str) -> str: def _guess_file_type(self, path: str) -> str:
""" """
Receives a URL or filename and returns global mimetype like 'image' or 'video' Receives a URL or filename and returns global mimetype like 'image' or 'video'
@@ -57,19 +62,3 @@ class Archiver(Step):
@abstractmethod @abstractmethod
def download(self, item: Metadata) -> Metadata: pass def download(self, item: Metadata) -> Metadata: pass
# TODO: how to fix allow predictable key
# def get_key(self, filename):
# """
# returns a key in the format "[archiverName]_[filename]" includes extension
# """
# tail = os.path.split(filename)[1] # returns filename.ext from full path
# _id, extension = os.path.splitext(tail) # returns [filename, .ext]
# if 'unknown_video' in _id:
# _id = _id.replace('unknown_video', 'jpg')
# # long filenames can cause problems, so trim them if necessary
# if len(_id) > 128:
# _id = _id[-128:]
# return f'{self.name}_{_id}{extension}'

View File

@@ -22,6 +22,10 @@ class TelegramArchiver(Archiver):
def configs() -> dict: def configs() -> dict:
return {} return {}
def is_rearchivable(self, url: str) -> bool:
# Telegram posts are treated as static, so this archiver always signals
# that the content should not be archived again (`url` is not inspected).
return False
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle

View File

@@ -39,6 +39,10 @@ class TelethonArchiver(Archiver):
} }
} }
def is_rearchivable(self, url: str) -> bool:
# Telegram posts are treated as static, so this archiver always signals
# that the content should not be archived again (`url` is not inspected).
return False
def setup(self) -> None: def setup(self) -> None:
""" """
1. trigger login process for telegram or proceed if already saved in a session file 1. trigger login process for telegram or proceed if already saved in a session file

View File

@@ -20,6 +20,10 @@ class TiktokArchiver(Archiver):
def configs() -> dict: def configs() -> dict:
return {} return {}
def is_rearchivable(self, url: str) -> bool:
# TikTok posts are treated as static, so this archiver always signals
# that the content should not be archived again (`url` is not inspected).
return False
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()
if 'tiktok.com' not in url: if 'tiktok.com' not in url:

View File

@@ -11,6 +11,7 @@ from . import Archiver
from ..core import Metadata from ..core import Metadata
from ..core import Media from ..core import Media
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
""" """
This Twitter Archiver uses unofficial scraping methods. This Twitter Archiver uses unofficial scraping methods.
@@ -18,6 +19,7 @@ class TwitterArchiver(Archiver):
name = "twitter_archiver" name = "twitter_archiver"
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
super().__init__(config) super().__init__(config)
@@ -26,6 +28,22 @@ class TwitterArchiver(Archiver):
def configs() -> dict: def configs() -> dict:
return {} return {}
def sanitize_url(self, url: str) -> str:
    """
    Expand t.co short links and strip tracker GET parameters from tweet URLs.

    If `url` contains 'https://t.co/', the link is followed with an HTTP GET and
    replaced by the final URL of the response. Expansion is best-effort: on any
    failure the error is logged and the unexpanded URL is used. The result is
    then passed through `self.link_clean_pattern`, which keeps only the part of a
    twitter.com URL up to the numeric id; URLs the pattern does not match are
    returned unchanged.

    :param url: the URL to sanitize
    :return: the expanded and cleaned URL
    """
    if 'https://t.co/' in url:
        try:
            # bounded wait so an unresponsive shortener cannot stall archiving forever
            r = requests.get(url, timeout=30)
            logger.debug(f'Expanded url {url} to {r.url}')
            url = r.url
        except Exception as e:
            # still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
            # and the cause of the failure is now part of the log entry
            logger.error(f'Failed to expand url {url}: {e}')
    # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
    return self.link_clean_pattern.sub("\\1", url)
def is_rearchivable(self, url: str) -> bool:
# Twitter posts are treated as static, so this archiver always signals
# that the content should not be archived again (`url` is not inspected).
return False
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
""" """
if this url is archivable will download post info and look for other posts from the same group with media. if this url is archivable will download post info and look for other posts from the same group with media.

View File

@@ -28,6 +28,10 @@ class VkArchiver(Archiver):
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
} }
def is_rearchivable(self, url: str) -> bool:
# VK content is treated as static, so this archiver always signals
# that the content should not be archived again (`url` is not inspected).
return False
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()

View File

@@ -22,7 +22,7 @@ class Metadata:
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list) media: List[Media] = field(default_factory=list)
final_media: Media = None # can be overwritten by formatters final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False rearchivable: bool = True # defaults to true, archivers can overwrite
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
""" """
@@ -140,14 +140,5 @@ class Metadata:
def get_clean_metadata(self) -> Metadata: def get_clean_metadata(self) -> Metadata:
return dict( return dict(
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys}, {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
**{"processed_at": self._processed_at} # TODO: move to enrichment **{"processed_at": self._processed_at}
) )
def cleanup(self) -> Metadata:
# Currently a no-op: the body is only `pass`, so despite the return
# annotation this method returns None and leaves the metadata untouched.
# TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
# the code below leads to errors if database needs tmp_keys after they are removed
# """removes temporary metadata fields, ideally called after all ops except writing"""
# for tmp_key in self.tmp_keys:
# self.metadata.pop(tmp_key, None)
# self.tmp_keys = set()
pass

View File

@@ -91,19 +91,19 @@ class ArchivingOrchestrator:
# default feeder is a list with 1 element # default feeder is a list with 1 element
def archive(self, result: Metadata) -> Union[Metadata, None]: def archive(self, result: Metadata) -> Union[Metadata, None]:
url = result.get_url() original_url = result.get_url()
# TODO: clean urls
for a in self.archivers:
url = a.clean_url(url)
result.set_url(url)
# should_archive = False
# for d in self.databases: should_archive |= d.should_process(url)
# should storages also be able to check?
# for s in self.storages: should_archive |= s.should_process(url)
# if not should_archive: # 1 - cleanup
# print("skipping") # each archiver is responsible for cleaning/expanding its own URLs
# return "skipping" url = original_url
for a in self.archivers: url = a.sanitize_url(url)
result.set_url(url)
if original_url != url: result.set("original_url", original_url)
# 2 - rearchiving logic + notify start to DB
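# archivers can signal whether the content is rearchivable, e.g. a tweet (static) vs a webpage (can change)
# NOTE(review): `|=` can only flip the flag from False to True; if result.rearchivable already starts as True, no archiver can turn it off here -- confirm this is intended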
for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
logger.debug(f"{result.rearchivable=} for {url=}")
# signal to DB that archiving has started # signal to DB that archiving has started
# and propagate already archived if it exists # and propagate already archived if it exists
@@ -117,33 +117,33 @@ class ArchivingOrchestrator:
if (local_result := d.fetch(result)): if (local_result := d.fetch(result)):
cached_result = (cached_result or Metadata()).merge(local_result) cached_result = (cached_result or Metadata()).merge(local_result)
if cached_result and not cached_result.rearchivable: if cached_result and not cached_result.rearchivable:
logger.debug("Found previously archived entry")
for d in self.databases: for d in self.databases:
d.done(cached_result) d.done(cached_result)
return cached_result return cached_result
# vk, telethon, ... # 3 - call archivers until one succeeds
for a in self.archivers: for a in self.archivers:
# with automatic try/catch in download + archived (+ the other ops below)
# should the archivers come with the config already? are there configs which change at runtime?
# think not, so no need to pass config as parameter
# do they need to be refreshed with every execution?
# this is where the Hashes come from, the place with access to all content
# the archiver does not have access to storage
# a.download(result) # TODO: refactor so there's not merge here
logger.info(f"Trying archiver {a.name}") logger.info(f"Trying archiver {a.name}")
result.merge(a.download(result)) try:
if result.is_success(): break # Q: should this be refactored so it's just a.download(result)?
result.merge(a.download(result))
if result.is_success(): break
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator? # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment? # should it call the HTMLgenerator as if it's not an enrichment?
# eg: if it is enabled: generates an HTML with all the returned media, should it include enrichers? yes # eg: if it is enabled: generates an HTML with all the returned media, should it include enrichers? yes
# then how to execute it last? should there also be post-processors? are there other examples? # then how to execute it last? should there also be post-processors? are there other examples?
# maybe as a PDF? or a Markdown file # maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
# 4 - call enrichers: have access to archived content, can generate metadata and Media
# eg: screenshot, wacz, webarchive, thumbnails
for e in self.enrichers: for e in self.enrichers:
e.enrich(result) e.enrich(result)
# store media # 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
for s in self.storages: for s in self.storages:
for m in result.media: for m in result.media:
s.store(m, result) # modifies media s.store(m, result) # modifies media
@@ -155,19 +155,14 @@ class ArchivingOrchestrator:
for prop_media in prop: for prop_media in prop:
s.store(prop_media, result) s.store(prop_media, result)
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li> # 6 - format and store formatted if needed
# TODO: should there only be 1 formatter? # enrichers typically need access to already stored URLs etc
# for f in self.formatters:
# result.merge(f.format(result))
# final format and store it
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):
for s in self.storages: for s in self.storages:
s.store(final_media, result) s.store(final_media, result)
result.set_final_media(final_media) result.set_final_media(final_media)
# signal completion to databases (DBs, Google Sheets, CSV, ...) # signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
result.cleanup()
for d in self.databases: d.done(result) for d in self.databases: d.done(result)
return result return result