From 59551b3b2033daeee830cde1c4e0adef7bb8f6f3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 27 Jul 2023 21:36:15 +0100 Subject: [PATCH] minor improvements: finding best twitter image quality --- src/auto_archiver/archivers/archiver.py | 1 + src/auto_archiver/archivers/twitter_archiver.py | 7 ++----- src/auto_archiver/utils/url.py | 8 ++++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index 324c474..2928ecf 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -51,6 +51,7 @@ class Archiver(Step): 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } d = requests.get(url, headers=headers) + assert d.status_code == 200, f"got response code {d.status_code} for {url=}" with open(to_filename, 'wb') as f: f.write(d.content) return to_filename diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index 1624484..aa764da 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -37,9 +37,6 @@ class TwitterArchiver(Archiver): # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w return self.link_clean_pattern.sub("\\1", url) - def best_quality_url(self, url: str) -> str: - return re.sub(r"name=(\w+)", "name=orig", url, 1) - def download(self, item: Metadata) -> Metadata: """ if this url is archivable will download post info and look for other posts from the same group with media. @@ -77,7 +74,7 @@ class TwitterArchiver(Archiver): media.set("src", variant.url) mimetype = variant.contentType elif type(tweet_media) == Photo: - media.set("src", self.best_quality_url(tweet_media.fullUrl)) + media.set("src", UrlUtil.twitter_best_quality_url(tweet_media.fullUrl)) mimetype = "image/jpeg" else: logger.warning(f"Could not get media URL of {tweet_media}") @@ -117,7 +114,7 @@ class TwitterArchiver(Archiver): for i, u in enumerate(urls): media = Media(filename="") - u = self.best_quality_url(u) + u = UrlUtil.twitter_best_quality_url(u) media.set("src", u) ext = "" if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]): diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 3f4e4d6..9f27ef6 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -45,3 +45,11 @@ class UrlUtil: if "twimg.com/profile_images" in url: return False if "twimg.com" in url and "/default_profile_images" in url: return False return True + + @staticmethod + def twitter_best_quality_url(url: str) -> str: + """ + some twitter image URLs point to a less-than best quality + this returns the URL pointing to the highest (original) quality + """ + return re.sub(r"name=(\w+)", "name=orig", url, 1)