minor improvements: finding best twitter image quality

This commit is contained in:
msramalho
2023-07-27 21:36:15 +01:00
parent f086d89111
commit 59551b3b20
3 changed files with 11 additions and 5 deletions

View File

@@ -51,6 +51,7 @@ class Archiver(Step):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
} }
d = requests.get(url, headers=headers) d = requests.get(url, headers=headers)
assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
with open(to_filename, 'wb') as f: with open(to_filename, 'wb') as f:
f.write(d.content) f.write(d.content)
return to_filename return to_filename

View File

@@ -37,9 +37,6 @@ class TwitterArchiver(Archiver):
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
return self.link_clean_pattern.sub("\\1", url) return self.link_clean_pattern.sub("\\1", url)
def best_quality_url(self, url: str) -> str:
    """
    Return `url` with its first `name=<value>` parameter set to `name=orig`.

    URLs without a `name` parameter are returned unchanged.
    """
    # `\b` stops the pattern from matching inside longer parameter names
    # such as `filename=` or `screen_name=`; `count` is passed by keyword
    # because positional `count` is deprecated since Python 3.13.
    return re.sub(r"\bname=\w+", "name=orig", url, count=1)
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
""" """
if this url is archivable will download post info and look for other posts from the same group with media. if this url is archivable will download post info and look for other posts from the same group with media.
@@ -77,7 +74,7 @@ class TwitterArchiver(Archiver):
media.set("src", variant.url) media.set("src", variant.url)
mimetype = variant.contentType mimetype = variant.contentType
elif type(tweet_media) == Photo: elif type(tweet_media) == Photo:
media.set("src", self.best_quality_url(tweet_media.fullUrl)) media.set("src", UrlUtil.twitter_best_quality_url(tweet_media.fullUrl))
mimetype = "image/jpeg" mimetype = "image/jpeg"
else: else:
logger.warning(f"Could not get media URL of {tweet_media}") logger.warning(f"Could not get media URL of {tweet_media}")
@@ -117,7 +114,7 @@ class TwitterArchiver(Archiver):
for i, u in enumerate(urls): for i, u in enumerate(urls):
media = Media(filename="") media = Media(filename="")
u = self.best_quality_url(u) u = UrlUtil.twitter_best_quality_url(u)
media.set("src", u) media.set("src", u)
ext = "" ext = ""
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]): if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):

View File

@@ -45,3 +45,11 @@ class UrlUtil:
if "twimg.com/profile_images" in url: return False if "twimg.com/profile_images" in url: return False
if "twimg.com" in url and "/default_profile_images" in url: return False if "twimg.com" in url and "/default_profile_images" in url: return False
return True return True
@staticmethod
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)