Exclude screenshot and WARC files from perceptual hashing

This commit is contained in:
msramalho
2023-07-28 12:51:24 +01:00
parent f15a70f859
commit 139bdec051

View File

@@ -29,8 +29,8 @@ class PdqHashEnricher(Enricher):
for m in to_enrich.media:
for media in m.all_inner_media(True):
if media.is_image() and media.get("id") != "screenshot" and len(hd := self.calculate_pdq_hash(media.filename)):
media.set("pdq_hash", hd)
if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)):
media.set("pdq_hash", hd)
def calculate_pdq_hash(self, filename):
# returns a hexadecimal string with the perceptual hash for the given filename
@@ -44,4 +44,4 @@ class PdqHashEnricher(Enricher):
return hex(int(hash, 2))[2:]
except UnidentifiedImageError as e:
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
return ""
return ""