minor improvements

This commit is contained in:
msramalho
2023-07-27 15:42:23 +01:00
parent dd034da844
commit e8f44b652e
3 changed files with 20 additions and 27 deletions

View File

@@ -6,7 +6,7 @@ from slugify import slugify
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media
-from ..utils.misc import remove_get_parameters
+from ..utils import UrlUtil
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
@@ -78,7 +78,7 @@ class TwitterArchiver(Archiver):
media.set("src", variant.url) media.set("src", variant.url)
mimetype = variant.contentType mimetype = variant.contentType
elif type(tweet_media) == Photo: elif type(tweet_media) == Photo:
-media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig'))
+media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
mimetype = "image/jpeg" mimetype = "image/jpeg"
else: else:
logger.warning(f"Could not get media URL of {tweet_media}") logger.warning(f"Could not get media URL of {tweet_media}")
@@ -96,21 +96,7 @@ class TwitterArchiver(Archiver):
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362 https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816 next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
""" """
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Origin": "https://platform.twitter.com",
"Connection": "keep-alive",
"Referer": "https://platform.twitter.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"TE": "trailers"
}
logger.debug(f"Trying twitter hack for {url=}") logger.debug(f"Trying twitter hack for {url=}")
result = Metadata() result = Metadata()
@@ -134,7 +120,7 @@ class TwitterArchiver(Archiver):
media = Media(filename="") media = Media(filename="")
media.set("src", u) media.set("src", u)
ext = "" ext = ""
-if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
+if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype) ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item) media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)

View File

@@ -109,6 +109,8 @@ class ArchivingOrchestrator:
# looks for Media in result.media and also result.media[x].properties (as list or dict values) # looks for Media in result.media and also result.media[x].properties (as list or dict values)
result.store() result.store()
#TODO: remove any duplicate media, if hash is available
# 6 - format and store formatted if needed # 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc # enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):

View File

@@ -1,6 +1,7 @@
import traceback
import pdqhash import pdqhash
import numpy as np import numpy as np
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
from loguru import logger from loguru import logger
from . import Enricher from . import Enricher
@@ -32,11 +33,15 @@ class PdqHashEnricher(Enricher):
media.set("pdq_hash", hd) media.set("pdq_hash", hd)
def calculate_pdq_hash(self, filename): def calculate_pdq_hash(self, filename):
# returns a hexadecimal string with the perceptual hash for the given filename # returns a hexadecimal string with the perceptual hash for the given filename
-with Image.open(filename) as img:
-# convert the image to RGB
-image_rgb = np.array(img.convert("RGB"))
-# compute the 256-bit PDQ hash (we do not store the quality score)
-hash_array, _ = pdqhash.compute(image_rgb)
-hash = "".join(str(b) for b in hash_array)
-return hex(int(hash, 2))[2:]
+try:
+with Image.open(filename) as img:
+# convert the image to RGB
+image_rgb = np.array(img.convert("RGB"))
+# compute the 256-bit PDQ hash (we do not store the quality score)
+hash_array, _ = pdqhash.compute(image_rgb)
+hash = "".join(str(b) for b in hash_array)
+return hex(int(hash, 2))[2:]
+except UnidentifiedImageError as e:
+logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
+return ""