Compare commits

..

3 Commits

Author SHA1 Message Date
msramalho
ca37d54b7f Bump version to v0.6.3 for release 2023-08-05 13:58:39 +01:00
msramalho
a1742b5565 fixing whisper enricher 2023-08-05 13:57:09 +01:00
msramalho
60a1f3a27a minor fixes 2023-07-31 16:08:48 +01:00
6 changed files with 18 additions and 6 deletions

View File

@@ -89,7 +89,7 @@ class Media:
try:
streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
logger.warning(f"STREAMS FOR {self.filename} {streams}")
return any(s.get("duration_ts") > 0 for s in streams)
return any(s.get("duration_ts", 0) > 0 for s in streams)
except Error: return False # ffmpeg errors when reading bad files
except Exception as e:
logger.error(e)

View File

@@ -26,11 +26,16 @@ class PdqHashEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"calculating perceptual hashes for {url=}")
media_with_hashes = []
for m in to_enrich.media:
for media in m.all_inner_media(True):
if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)):
media_id = media.get("id", "")
if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)):
media.set("pdq_hash", hd)
media_with_hashes.append(media.filename)
logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}")
def calculate_pdq_hash(self, filename):
# returns a hexadecimal string with the perceptual hash for the given filename

View File

@@ -18,17 +18,18 @@ class WhisperEnricher(Enricher):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint"
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
self.timeout = int(self.timeout)
@staticmethod
def configs() -> dict:
return {
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
}
@@ -76,6 +77,7 @@ class WhisperEnricher(Enricher):
"type": self.action,
# "language": "string" # may be a config
}
logger.debug(f"calling API with {payload=}")
response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
logger.debug(response.json())

View File

@@ -16,7 +16,7 @@ No URL available for {{ m.key }}.
<a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,&nbsp;
<a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,&nbsp;
<a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,&nbsp;
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,&nbsp;
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>
</div>
<p></p>
</div>

View File

@@ -52,6 +52,11 @@ class UrlUtil:
# telegram
if "https://telegram.org/img/emoji/" in url: return False
# youtube
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
if "https://www.youtube.com/s/search/audio/" in url: return False
return True
@staticmethod

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "6"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "2"
_PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""