mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Excludes screenshot and WARC-file media from perceptual (PDQ) hashing
This commit is contained in:
@@ -29,8 +29,8 @@ class PdqHashEnricher(Enricher):
|
||||
|
||||
for m in to_enrich.media:
|
||||
for media in m.all_inner_media(True):
|
||||
if media.is_image() and media.get("id") != "screenshot" and len(hd := self.calculate_pdq_hash(media.filename)):
|
||||
media.set("pdq_hash", hd)
|
||||
if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)):
|
||||
media.set("pdq_hash", hd)
|
||||
|
||||
def calculate_pdq_hash(self, filename):
|
||||
# returns a hexadecimal string with the perceptual hash for the given filename
|
||||
@@ -44,4 +44,4 @@ class PdqHashEnricher(Enricher):
|
||||
return hex(int(hash, 2))[2:]
|
||||
except UnidentifiedImageError as e:
|
||||
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
|
||||
return ""
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user