mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
cleanup + rearchivable logic
This commit is contained in:
@@ -23,10 +23,15 @@ class Archiver(Step):
|
|||||||
# used when archivers need to login or do other one-time setup
|
# used when archivers need to login or do other one-time setup
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def clean_url(self, url: str) -> str:
|
def sanitize_url(self, url: str) -> str:
|
||||||
# used to clean unnecessary URL parameters
|
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Tell the caller whether archiving `url` again makes sense.

    Archivers override this to signal that rearchiving a piece of content
    is pointless; the base implementation always answers yes.
    """
    rearchive_by_default = True
    return rearchive_by_default
|
||||||
|
|
||||||
def _guess_file_type(self, path: str) -> str:
|
def _guess_file_type(self, path: str) -> str:
|
||||||
"""
|
"""
|
||||||
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
||||||
@@ -57,19 +62,3 @@ class Archiver(Step):
|
|||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def download(self, item: Metadata) -> Metadata: pass
|
def download(self, item: Metadata) -> Metadata: pass
|
||||||
|
|
||||||
# TODO: how to fix allow predictable key
|
|
||||||
# def get_key(self, filename):
|
|
||||||
# """
|
|
||||||
# returns a key in the format "[archiverName]_[filename]" includes extension
|
|
||||||
# """
|
|
||||||
# tail = os.path.split(filename)[1] # returns filename.ext from full path
|
|
||||||
# _id, extension = os.path.splitext(tail) # returns [filename, .ext]
|
|
||||||
# if 'unknown_video' in _id:
|
|
||||||
# _id = _id.replace('unknown_video', 'jpg')
|
|
||||||
|
|
||||||
# # long filenames can cause problems, so trim them if necessary
|
|
||||||
# if len(_id) > 128:
|
|
||||||
# _id = _id[-128:]
|
|
||||||
|
|
||||||
# return f'{self.name}_{_id}{extension}'
|
|
||||||
@@ -22,6 +22,10 @@ class TelegramArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Always answer False: telegram posts are considered static."""
    posts_are_static = True
    return not posts_are_static
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
|
|||||||
@@ -39,6 +39,10 @@ class TelethonArchiver(Archiver):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Always answer False: telegram posts are considered static."""
    posts_are_static = True
    return not posts_are_static
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
"""
|
"""
|
||||||
1. trigger login process for telegram or proceed if already saved in a session file
|
1. trigger login process for telegram or proceed if already saved in a session file
|
||||||
|
|||||||
@@ -20,6 +20,10 @@ class TiktokArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Always answer False: TikTok posts are considered static."""
    posts_are_static = True
    return not posts_are_static
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if 'tiktok.com' not in url:
|
if 'tiktok.com' not in url:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from . import Archiver
|
|||||||
from ..core import Metadata
|
from ..core import Metadata
|
||||||
from ..core import Media
|
from ..core import Media
|
||||||
|
|
||||||
|
|
||||||
class TwitterArchiver(Archiver):
|
class TwitterArchiver(Archiver):
|
||||||
"""
|
"""
|
||||||
This Twitter Archiver uses unofficial scraping methods.
|
This Twitter Archiver uses unofficial scraping methods.
|
||||||
@@ -18,6 +19,7 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
name = "twitter_archiver"
|
name = "twitter_archiver"
|
||||||
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||||
|
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
|
||||||
|
|
||||||
def __init__(self, config: dict) -> None:
|
def __init__(self, config: dict) -> None:
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@@ -26,6 +28,22 @@ class TwitterArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def sanitize_url(self, url: str) -> str:
    """Expand t.co short links and strip trailing GET parameters from tweet URLs.

    If `url` contains a ``https://t.co/`` link it is requested and replaced
    by the final URL of the response. On failure the error is logged and
    the URL is left unexpanded. The result is then passed through
    ``self.link_clean_pattern`` so that anything after the tweet id (e.g.
    ``?s=20&t=...``) is dropped. URLs that do not match the pattern are
    returned unchanged.
    """
    if 'https://t.co/' in url:
        try:
            # bounded wait so an unresponsive shortener cannot stall the run
            r = requests.get(url, timeout=30)
            logger.debug(f'Expanded url {url} to {r.url}')
            url = r.url
        except Exception as e:
            # best-effort expansion: keep the short URL, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare `except:` would
            logger.error(f'Failed to expand url {url}: {e}')
    # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
    return self.link_clean_pattern.sub(r"\1", url)
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Always answer False: Twitter posts are considered static."""
    posts_are_static = True
    return not posts_are_static
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
"""
|
"""
|
||||||
if this url is archivable will download post info and look for other posts from the same group with media.
|
if this url is archivable will download post info and look for other posts from the same group with media.
|
||||||
|
|||||||
@@ -28,6 +28,10 @@ class VkArchiver(Archiver):
|
|||||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def is_rearchivable(self, url: str) -> bool:
    """Always answer False: VK content is considered static."""
    content_is_static = True
    return not content_is_static
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class Metadata:
|
|||||||
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
|
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
final_media: Media = None # can be overwritten by formatters
|
final_media: Media = None # can be overwritten by formatters
|
||||||
rearchivable: bool = False
|
rearchivable: bool = True # defaults to true, archivers can overwrite
|
||||||
|
|
||||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||||
"""
|
"""
|
||||||
@@ -140,14 +140,5 @@ class Metadata:
|
|||||||
def get_clean_metadata(self) -> Metadata:
|
def get_clean_metadata(self) -> Metadata:
|
||||||
return dict(
|
return dict(
|
||||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
||||||
**{"processed_at": self._processed_at} # TODO: move to enrichment
|
**{"processed_at": self._processed_at}
|
||||||
)
|
)
|
||||||
|
|
||||||
def cleanup(self) -> Metadata:
|
|
||||||
# TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
|
|
||||||
# the code below leads to errors if database needs tmp_keys after they are removed
|
|
||||||
# """removes temporary metadata fields, ideally called after all ops except writing"""
|
|
||||||
# for tmp_key in self.tmp_keys:
|
|
||||||
# self.metadata.pop(tmp_key, None)
|
|
||||||
# self.tmp_keys = set()
|
|
||||||
pass
|
|
||||||
|
|||||||
@@ -91,19 +91,19 @@ class ArchivingOrchestrator:
|
|||||||
# default feeder is a list with 1 element
|
# default feeder is a list with 1 element
|
||||||
|
|
||||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||||
url = result.get_url()
|
original_url = result.get_url()
|
||||||
# TODO: clean urls
|
|
||||||
for a in self.archivers:
|
|
||||||
url = a.clean_url(url)
|
|
||||||
result.set_url(url)
|
|
||||||
# should_archive = False
|
|
||||||
# for d in self.databases: should_archive |= d.should_process(url)
|
|
||||||
# should storages also be able to check?
|
|
||||||
# for s in self.storages: should_archive |= s.should_process(url)
|
|
||||||
|
|
||||||
# if not should_archive:
|
# 1 - cleanup
|
||||||
# print("skipping")
|
# each archiver is responsible for cleaning/expanding its own URLs
|
||||||
# return "skipping"
|
url = original_url
|
||||||
|
for a in self.archivers: url = a.sanitize_url(url)
|
||||||
|
result.set_url(url)
|
||||||
|
if original_url != url: result.set("original_url", original_url)
|
||||||
|
|
||||||
|
# 2 - rearchiving logic + notify start to DB
|
||||||
|
# archivers can signal whether the content is rearchivable: eg: tweet vs webpage
|
||||||
|
for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
|
||||||
|
logger.debug(f"{result.rearchivable=} for {url=}")
|
||||||
|
|
||||||
# signal to DB that archiving has started
|
# signal to DB that archiving has started
|
||||||
# and propagate already archived if it exists
|
# and propagate already archived if it exists
|
||||||
@@ -117,33 +117,33 @@ class ArchivingOrchestrator:
|
|||||||
if (local_result := d.fetch(result)):
|
if (local_result := d.fetch(result)):
|
||||||
cached_result = (cached_result or Metadata()).merge(local_result)
|
cached_result = (cached_result or Metadata()).merge(local_result)
|
||||||
if cached_result and not cached_result.rearchivable:
|
if cached_result and not cached_result.rearchivable:
|
||||||
|
logger.debug("Found previously archived entry")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.done(cached_result)
|
d.done(cached_result)
|
||||||
return cached_result
|
return cached_result
|
||||||
|
|
||||||
# vk, telethon, ...
|
# 3 - call archivers until one succeeds
|
||||||
for a in self.archivers:
|
for a in self.archivers:
|
||||||
# with automatic try/catch in download + archived (+ the other ops below)
|
|
||||||
# should the archivers come with the config already? are there configs which change at runtime?
|
|
||||||
# think not, so no need to pass config as parameter
|
|
||||||
# do they need to be refreshed with every execution?
|
|
||||||
# this is where the Hashes come from, the place with access to all content
|
|
||||||
# the archiver does not have access to storage
|
|
||||||
# a.download(result) # TODO: refactor so there's not merge here
|
|
||||||
logger.info(f"Trying archiver {a.name}")
|
logger.info(f"Trying archiver {a.name}")
|
||||||
result.merge(a.download(result))
|
try:
|
||||||
if result.is_success(): break
|
# Q: should this be refactored so it's just a.download(result)?
|
||||||
|
result.merge(a.download(result))
|
||||||
|
if result.is_success(): break
|
||||||
|
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
|
||||||
|
|
||||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||||
# eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
|
# eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
|
||||||
# then how to execute it last? should there also be post-processors? are there other examples?
|
# then how to execute it last? should there also be post-processors? are there other examples?
|
||||||
# maybe as a PDF? or a Markdown file
|
# maybe as a PDF? or a Markdown file
|
||||||
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
|
|
||||||
|
# 4 - call enrichers: have access to archived content, can generate metadata and Media
|
||||||
|
# eg: screenshot, wacz, webarchive, thumbnails
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
e.enrich(result)
|
e.enrich(result)
|
||||||
|
|
||||||
# store media
|
# 5 - store media
|
||||||
|
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||||
for s in self.storages:
|
for s in self.storages:
|
||||||
for m in result.media:
|
for m in result.media:
|
||||||
s.store(m, result) # modifies media
|
s.store(m, result) # modifies media
|
||||||
@@ -155,19 +155,14 @@ class ArchivingOrchestrator:
|
|||||||
for prop_media in prop:
|
for prop_media in prop:
|
||||||
s.store(prop_media, result)
|
s.store(prop_media, result)
|
||||||
|
|
||||||
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
# 6 - format and store formatted if needed
|
||||||
# TODO: should there only be 1 formatter?
|
# enrichers typically need access to already stored URLs etc
|
||||||
# for f in self.formatters:
|
|
||||||
# result.merge(f.format(result))
|
|
||||||
# final format and store it
|
|
||||||
if (final_media := self.formatter.format(result)):
|
if (final_media := self.formatter.format(result)):
|
||||||
for s in self.storages:
|
for s in self.storages:
|
||||||
s.store(final_media, result)
|
s.store(final_media, result)
|
||||||
result.set_final_media(final_media)
|
result.set_final_media(final_media)
|
||||||
|
|
||||||
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
||||||
# a hash registration service could be one database: forensic archiving
|
|
||||||
result.cleanup()
|
|
||||||
for d in self.databases: d.done(result)
|
for d in self.databases: d.done(result)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
Reference in New Issue
Block a user