diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index b99e0d8..c5d907d 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -6,7 +6,7 @@ from slugify import slugify from . import Archiver from ..core import Metadata, Media -from ..utils.misc import remove_get_parameters +from ..utils import UrlUtil class TwitterArchiver(Archiver): @@ -78,7 +78,7 @@ class TwitterArchiver(Archiver): media.set("src", variant.url) mimetype = variant.contentType elif type(tweet_media) == Photo: - media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig')) + media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig')) mimetype = "image/jpeg" else: logger.warning(f"Could not get media URL of {tweet_media}") @@ -96,21 +96,7 @@ class TwitterArchiver(Archiver): https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362 next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816 """ - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0", - "Accept": "*/*", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - "Origin": "https://platform.twitter.com", - "Connection": "keep-alive", - "Referer": "https://platform.twitter.com/", - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "cors", - "Sec-Fetch-Site": "cross-site", - "Pragma": "no-cache", - "Cache-Control": "no-cache", - "TE": "trailers" - } + logger.debug(f"Trying twitter hack for {url=}") result = Metadata() @@ -134,7 +120,7 @@ class TwitterArchiver(Archiver): media = Media(filename="") media.set("src", u) ext = "" - if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]): + if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]): ext = mimetypes.guess_extension(mtype) media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index c60bf65..8ac2ddf 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -109,6 +109,8 @@ class ArchivingOrchestrator: # looks for Media in result.media and also result.media[x].properties (as list or dict values) result.store() + #TODO: remove any duplicate media, if hash is available + # 6 - format and store formatted if needed # enrichers typically need access to already stored URLs etc if (final_media := self.formatter.format(result)): diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/enrichers/pdq_hash_enricher.py index 79cd604..9b11053 100644 --- a/src/auto_archiver/enrichers/pdq_hash_enricher.py +++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py @@ -1,6 +1,7 @@ +import traceback import pdqhash import numpy as np -from PIL import Image +from PIL import Image, UnidentifiedImageError from loguru import logger from . import Enricher @@ -32,11 +33,15 @@ class PdqHashEnricher(Enricher): media.set("pdq_hash", hd) def calculate_pdq_hash(self, filename): - # returns a hexadecimal string with the perceptual hash for the given filename - with Image.open(filename) as img: - # convert the image to RGB - image_rgb = np.array(img.convert("RGB")) - # compute the 256-bit PDQ hash (we do not store the quality score) - hash_array, _ = pdqhash.compute(image_rgb) - hash = "".join(str(b) for b in hash_array) - return hex(int(hash, 2))[2:] + # returns a hexadecimal string with the perceptual hash for the given filename + try: + with Image.open(filename) as img: + # convert the image to RGB + image_rgb = np.array(img.convert("RGB")) + # compute the 256-bit PDQ hash (we do not store the quality score) + hash_array, _ = pdqhash.compute(image_rgb) + hash = "".join(str(b) for b in hash_array) + return hex(int(hash, 2))[2:] + except UnidentifiedImageError as e: + logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}") + return "" \ No newline at end of file