Compare commits

...

13 Commits

Author SHA1 Message Date
msramalho
3bd6bed825 Bump version to v0.5.10 for release 2023-05-02 19:44:00 +01:00
msramalho
2659675f06 skip trim 2023-05-02 19:06:10 +01:00
msramalho
9d44f4b207 content append instead of replace 2023-05-02 19:06:00 +01:00
msramalho
5b0bff612e whisper transcripts to content 2023-05-02 19:05:32 +01:00
msramalho
ae7ceba0e5 better debug 2023-05-02 19:05:18 +01:00
msramalho
97821a81bc log cleanup 2023-05-02 19:05:06 +01:00
msramalho
9191b38cf2 tbot archiver works 2023-05-02 19:04:51 +01:00
msramalho
567edfc35e Bump version to v0.5.8 for release 2023-05-02 14:30:49 +01:00
msramalho
8c22a9df72 fixes "url-not-found" 2023-05-02 14:30:07 +01:00
msramalho
d2d6db162b Bump version to v0.5.7 for release 2023-04-18 19:28:51 +01:00
msramalho
5cfbcc0137 html template copy ux 2023-04-18 19:28:43 +01:00
msramalho
5fdaa6c739 whisper improvements 2023-04-18 19:28:36 +01:00
msramalho
3d389ee05b add url info 2023-04-18 19:14:47 +01:00
8 changed files with 25 additions and 20 deletions

View File

@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."}, "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
} }
def setup(self) -> None: def setup(self) -> None:
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
attempts = 0 attempts = 0
seen_media = [] seen_media = []
message = "" message = ""
time.sleep(4) time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism # media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < self.timeout and (not message or not len(seen_media)): while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1 attempts += 1
time.sleep(1) time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id): for post in self.client.iter_messages(chat, min_id=since_id):

View File

@@ -27,7 +27,7 @@ class ArchivingContext:
@staticmethod @staticmethod
def set(key, value, keep_on_reset: bool = False): def set(key, value, keep_on_reset: bool = False):
logger.error(f"SET [{key}]={value}") logger.debug(f"SET [{key}]={value}")
ac = ArchivingContext.get_instance() ac = ArchivingContext.get_instance()
ac.configs[key] = value ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key) if keep_on_reset: ac.keep_on_reset.add(key)

View File

@@ -47,7 +47,7 @@ class Metadata:
# calls .store for all contained media. storages [Storage] # calls .store for all contained media. storages [Storage]
storages = override_storages or ArchivingContext.get("storages") storages = override_storages or ArchivingContext.get("storages")
for media in self.media: for media in self.media:
media.store(override_storages=storages) media.store(override_storages=storages, url=self.get_url())
def set(self, key: str, val: Any) -> Metadata: def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val self.metadata[key] = val
@@ -89,7 +89,8 @@ class Metadata:
def set_content(self, content: str) -> Metadata: def set_content(self, content: str) -> Metadata:
# a dump with all the relevant content # a dump with all the relevant content
return self.set("content", content) append_content = (self.get("content", "") + content + "\n").strip()
return self.set("content", append_content)
def set_title(self, title: str) -> Metadata: def set_title(self, title: str) -> Metadata:
return self.set("title", title) return self.set("title", title)

View File

@@ -93,7 +93,7 @@ class ArchivingOrchestrator:
# Q: should this be refactored so it's just a.download(result)? # Q: should this be refactored so it's just a.download(result)?
result.merge(a.download(result)) result.merge(a.download(result))
if result.is_success(): break if result.is_success(): break
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}") except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator? # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment? # should it call the HTMLgenerator as if it's not an enrichment?
@@ -105,7 +105,7 @@ class ArchivingOrchestrator:
# eg: screenshot, wacz, webarchive, thumbnails # eg: screenshot, wacz, webarchive, thumbnails
for e in self.enrichers: for e in self.enrichers:
try: e.enrich(result) try: e.enrich(result)
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}") except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store media # 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values) # looks for Media in result.media and also result.media[x].properties (as list or dict values)
@@ -114,7 +114,7 @@ class ArchivingOrchestrator:
# 6 - format and store formatted if needed # 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc # enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):
final_media.store() final_media.store(url=url)
result.set_final_media(final_media) result.set_final_media(final_media)
if result.is_empty(): if result.is_empty():

View File

@@ -62,7 +62,7 @@ class GsheetsDb(Database):
batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls)) batch_if_valid('screenshot', "\n".join(screenshot.urls))

View File

@@ -26,6 +26,7 @@ class WhisperEnricher(Enricher):
return { return {
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"}, "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"}, "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]}, "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
@@ -42,7 +43,7 @@ class WhisperEnricher(Enricher):
job_results = {} job_results = {}
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio(): if m.is_video() or m.is_audio():
m.store() m.store(url=url)
try: try:
job_id = self.submit_job(m) job_id = self.submit_job(m)
job_results[job_id] = False job_results[job_id] = False
@@ -58,8 +59,13 @@ class WhisperEnricher(Enricher):
job_id = to_enrich.media[i].get("whisper_model")["job_id"] job_id = to_enrich.media[i].get("whisper_model")["job_id"]
to_enrich.media[i].set("whisper_model", { to_enrich.media[i].set("whisper_model", {
"job_id": job_id, "job_id": job_id,
self.action: job_results[job_id] **(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
}) })
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
if job_results[job_id]:
for k,v in job_results[job_id].items():
if "_text" in k and len(v):
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
def submit_job(self, media: Media): def submit_job(self, media: Media):
s3 = self._get_s3_storage() s3 = self._get_s3_storage()
@@ -100,18 +106,16 @@ class WhisperEnricher(Enricher):
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'}) r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}" assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
logger.success(r_res.json()) logger.success(r_res.json())
result = [] result = {}
for artifact in r_res.json(): for art_id, artifact in enumerate(r_res.json()):
subtitle = [] subtitle = []
full_text = [] full_text = []
for i, d in enumerate(artifact.get("data")): for i, d in enumerate(artifact.get("data")):
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}") subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
full_text.append(d.get('text').strip()) full_text.append(d.get('text').strip())
if not len(subtitle): continue if not len(subtitle): continue
result.append({ if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
"subtitle": "\n".join(subtitle), result[f"artifact_{art_id}_text"] = "\n".join(full_text)
"full_text": "\n".join(full_text),
})
return result return result
return False return False

View File

@@ -42,7 +42,7 @@
} }
.copy:hover { .copy:hover {
font-weight: 600; background: aliceblue;
cursor: copy; cursor: copy;
} }

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "5" _MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "6" _PATCH = "10"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""