mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
removes rearchiving logic
This commit is contained in:
@@ -27,11 +27,6 @@ class Archiver(Step):
|
|||||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# archivers can signal if it does not make sense to rearchive a piece of content
|
|
||||||
# default is rearchiving
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _guess_file_type(self, path: str) -> str:
|
def _guess_file_type(self, path: str) -> str:
|
||||||
"""
|
"""
|
||||||
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
||||||
|
|||||||
@@ -19,10 +19,6 @@ class TelegramArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# telegram posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
|
|||||||
@@ -38,10 +38,6 @@ class TelethonArchiver(Archiver):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# telegram posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
"""
|
"""
|
||||||
1. trigger login process for telegram or proceed if already saved in a session file
|
1. trigger login process for telegram or proceed if already saved in a session file
|
||||||
|
|||||||
@@ -16,10 +16,6 @@ class TiktokArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# TikTok posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if 'tiktok.com' not in url:
|
if 'tiktok.com' not in url:
|
||||||
|
|||||||
@@ -37,9 +37,8 @@ class TwitterArchiver(Archiver):
|
|||||||
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
||||||
return self.link_clean_pattern.sub("\\1", url)
|
return self.link_clean_pattern.sub("\\1", url)
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
def best_quality_url(self, url: str) -> str:
|
||||||
# Twitter posts are static (for now)
|
return re.sub(r"name=(\w+)", "name=orig", url, 1)
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
"""
|
"""
|
||||||
@@ -78,7 +77,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media.set("src", variant.url)
|
media.set("src", variant.url)
|
||||||
mimetype = variant.contentType
|
mimetype = variant.contentType
|
||||||
elif type(tweet_media) == Photo:
|
elif type(tweet_media) == Photo:
|
||||||
media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
|
media.set("src", self.best_quality_url(tweet_media.fullUrl))
|
||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Could not get media URL of {tweet_media}")
|
logger.warning(f"Could not get media URL of {tweet_media}")
|
||||||
@@ -118,6 +117,7 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
for i, u in enumerate(urls):
|
for i, u in enumerate(urls):
|
||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
|
u = self.best_quality_url(u)
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
ext = ""
|
ext = ""
|
||||||
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
||||||
|
|||||||
@@ -27,10 +27,6 @@ class VkArchiver(Archiver):
|
|||||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# VK content is static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ class Metadata:
|
|||||||
status: str = "no archiver"
|
status: str = "no archiver"
|
||||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
rearchivable: bool = True # defaults to true, archivers can overwrite
|
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
self.set("_processed_at", datetime.datetime.utcnow())
|
self.set("_processed_at", datetime.datetime.utcnow())
|
||||||
@@ -29,7 +28,6 @@ class Metadata:
|
|||||||
if overwrite_left:
|
if overwrite_left:
|
||||||
if right.status and len(right.status):
|
if right.status and len(right.status):
|
||||||
self.status = right.status
|
self.status = right.status
|
||||||
self.rearchivable |= right.rearchivable
|
|
||||||
for k, v in right.metadata.items():
|
for k, v in right.metadata.items():
|
||||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||||
|
|||||||
@@ -62,11 +62,7 @@ class ArchivingOrchestrator:
|
|||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
if original_url != url: result.set("original_url", original_url)
|
if original_url != url: result.set("original_url", original_url)
|
||||||
|
|
||||||
# 2 - rearchiving logic + notify start to DB
|
# 2 - notify start to DB
|
||||||
# archivers can signal whether the content is rearchivable: eg: tweet vs webpage
|
|
||||||
for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
|
|
||||||
logger.debug(f"{result.rearchivable=} for {url=}")
|
|
||||||
|
|
||||||
# signal to DB that archiving has started
|
# signal to DB that archiving has started
|
||||||
# and propagate already archived if it exists
|
# and propagate already archived if it exists
|
||||||
cached_result = None
|
cached_result = None
|
||||||
@@ -78,7 +74,7 @@ class ArchivingOrchestrator:
|
|||||||
d.started(result)
|
d.started(result)
|
||||||
if (local_result := d.fetch(result)):
|
if (local_result := d.fetch(result)):
|
||||||
cached_result = (cached_result or Metadata()).merge(local_result)
|
cached_result = (cached_result or Metadata()).merge(local_result)
|
||||||
if cached_result and not cached_result.rearchivable:
|
if cached_result:
|
||||||
logger.debug("Found previously archived entry")
|
logger.debug("Found previously archived entry")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.done(cached_result)
|
d.done(cached_result)
|
||||||
|
|||||||
Reference in New Issue
Block a user