diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/enrichers/pdq_hash_enricher.py
index 9b11053..da99375 100644
--- a/src/auto_archiver/enrichers/pdq_hash_enricher.py
+++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py
@@ -29,8 +29,10 @@ class PdqHashEnricher(Enricher):
 
         for m in to_enrich.media:
             for media in m.all_inner_media(True):
-                if media.is_image() and media.get("id") != "screenshot" and len(hd := self.calculate_pdq_hash(media.filename)):
-                    media.set("pdq_hash", hd)
+                # fall back to "" so media without an id cannot break the substring checks below
+                media_id = media.get("id") or ""
+                if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)):
+                    media.set("pdq_hash", hd)
 
     def calculate_pdq_hash(self, filename):
         # returns a hexadecimal string with the perceptual hash for the given filename
@@ -44,4 +46,4 @@ class PdqHashEnricher(Enricher):
             return hex(int(hash, 2))[2:]
         except UnidentifiedImageError as e:
             logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
-        return ""
\ No newline at end of file
+        return ""