From 60a1f3a27ab01f082ba114bf1498e7b27282108e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 31 Jul 2023 16:08:48 +0100 Subject: [PATCH] minor fixes --- src/auto_archiver/enrichers/pdq_hash_enricher.py | 7 ++++++- src/auto_archiver/formatters/templates/macros.html | 2 +- src/auto_archiver/utils/url.py | 5 +++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/enrichers/pdq_hash_enricher.py index da99375..ff88bab 100644 --- a/src/auto_archiver/enrichers/pdq_hash_enricher.py +++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py @@ -26,11 +26,16 @@ class PdqHashEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating perceptual hashes for {url=}") + media_with_hashes = [] for m in to_enrich.media: for media in m.all_inner_media(True): - if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)): + media_id = media.get("id", "") + if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)): media.set("pdq_hash", hd) + media_with_hashes.append(media.filename) + + logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}") def calculate_pdq_hash(self, filename): # returns a hexadecimal string with the perceptual hash for the given filename diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html index d03a36c..50c3673 100644 --- a/src/auto_archiver/formatters/templates/macros.html +++ b/src/auto_archiver/formatters/templates/macros.html @@ -16,7 +16,7 @@ No URL available for {{ m.key }}. Google LensYandexBing,  - Tineye,  + Tineye

diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 6c43909..ce96330 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -52,6 +52,11 @@ class UrlUtil: # telegram if "https://telegram.org/img/emoji/" in url: return False + # youtube + if "https://www.youtube.com/s/gaming/emoji/" in url: return False + if "https://yt3.ggpht.com" in url and "default-user=" in url: return False + if "https://www.youtube.com/s/search/audio/" in url: return False + return True @staticmethod