mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
minor improvements: finding best twitter image quality
This commit is contained in:
@@ -51,6 +51,7 @@ class Archiver(Step):
|
|||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||||
}
|
}
|
||||||
d = requests.get(url, headers=headers)
|
d = requests.get(url, headers=headers)
|
||||||
|
assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
|
||||||
with open(to_filename, 'wb') as f:
|
with open(to_filename, 'wb') as f:
|
||||||
f.write(d.content)
|
f.write(d.content)
|
||||||
return to_filename
|
return to_filename
|
||||||
|
|||||||
@@ -37,9 +37,6 @@ class TwitterArchiver(Archiver):
|
|||||||
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
||||||
return self.link_clean_pattern.sub("\\1", url)
|
return self.link_clean_pattern.sub("\\1", url)
|
||||||
|
|
||||||
def best_quality_url(self, url: str) -> str:
    """
    Return `url` with its first ``name=<value>`` query parameter set to ``name=orig``.

    URLs that carry no ``name`` query parameter are returned unchanged.
    """
    # Anchor on the preceding "?" or "&" so parameters that merely end in
    # "name" (e.g. "username=") are left alone. `count` is passed by keyword
    # because the positional form is deprecated since Python 3.13.
    return re.sub(r"([?&])name=\w+", r"\1name=orig", url, count=1)
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
"""
|
"""
|
||||||
if this url is archivable will download post info and look for other posts from the same group with media.
|
if this url is archivable will download post info and look for other posts from the same group with media.
|
||||||
@@ -77,7 +74,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media.set("src", variant.url)
|
media.set("src", variant.url)
|
||||||
mimetype = variant.contentType
|
mimetype = variant.contentType
|
||||||
elif type(tweet_media) == Photo:
|
elif type(tweet_media) == Photo:
|
||||||
media.set("src", self.best_quality_url(tweet_media.fullUrl))
|
media.set("src", UrlUtil.twitter_best_quality_url(tweet_media.fullUrl))
|
||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Could not get media URL of {tweet_media}")
|
logger.warning(f"Could not get media URL of {tweet_media}")
|
||||||
@@ -117,7 +114,7 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
for i, u in enumerate(urls):
|
for i, u in enumerate(urls):
|
||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
u = self.best_quality_url(u)
|
u = UrlUtil.twitter_best_quality_url(u)
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
ext = ""
|
ext = ""
|
||||||
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
||||||
|
|||||||
@@ -45,3 +45,11 @@ class UrlUtil:
|
|||||||
if "twimg.com/profile_images" in url: return False
|
if "twimg.com/profile_images" in url: return False
|
||||||
if "twimg.com" in url and "/default_profile_images" in url: return False
|
if "twimg.com" in url and "/default_profile_images" in url: return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def twitter_best_quality_url(url: str) -> str:
|
||||||
|
"""
|
||||||
|
some twitter image URLs point to a less-than best quality
|
||||||
|
this returns the URL pointing to the highest (original) quality
|
||||||
|
"""
|
||||||
|
return re.sub(r"name=(\w+)", "name=orig", url, 1)
|
||||||
|
|||||||
Reference in New Issue
Block a user