Compare commits

...

13 Commits

Author SHA1 Message Date
msramalho
3bd6bed825 Bump version to v0.5.10 for release 2023-05-02 19:44:00 +01:00
msramalho
2659675f06 skip trim 2023-05-02 19:06:10 +01:00
msramalho
9d44f4b207 content append instead of replace 2023-05-02 19:06:00 +01:00
msramalho
5b0bff612e whisper transcripts to content 2023-05-02 19:05:32 +01:00
msramalho
ae7ceba0e5 better debug 2023-05-02 19:05:18 +01:00
msramalho
97821a81bc log cleanup 2023-05-02 19:05:06 +01:00
msramalho
9191b38cf2 tbot archiver works 2023-05-02 19:04:51 +01:00
msramalho
567edfc35e Bump version to v0.5.8 for release 2023-05-02 14:30:49 +01:00
msramalho
8c22a9df72 fixes "url-not-found" 2023-05-02 14:30:07 +01:00
msramalho
d2d6db162b Bump version to v0.5.7 for release 2023-04-18 19:28:51 +01:00
msramalho
5cfbcc0137 html template copy ux 2023-04-18 19:28:43 +01:00
msramalho
5fdaa6c739 whisper improvements 2023-04-18 19:28:36 +01:00
msramalho
3d389ee05b add url info 2023-04-18 19:14:47 +01:00
8 changed files with 25 additions and 20 deletions

View File

@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
}
def setup(self) -> None:
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
attempts = 0
seen_media = []
message = ""
time.sleep(4)
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < self.timeout and (not message or not len(seen_media)):
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):

View File

@@ -27,7 +27,7 @@ class ArchivingContext:
@staticmethod
def set(key, value, keep_on_reset: bool = False):
logger.error(f"SET [{key}]={value}")
logger.debug(f"SET [{key}]={value}")
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)

View File

@@ -47,7 +47,7 @@ class Metadata:
# calls .store for all contained media. storages [Storage]
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages)
media.store(override_storages=storages, url=self.get_url())
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val
@@ -89,7 +89,8 @@ class Metadata:
def set_content(self, content: str) -> Metadata:
# a dump with all the relevant content
return self.set("content", content)
append_content = (self.get("content", "") + content + "\n").strip()
return self.set("content", append_content)
def set_title(self, title: str) -> Metadata:
return self.set("title", title)

View File

@@ -93,7 +93,7 @@ class ArchivingOrchestrator:
# Q: should this be refactored so it's just a.download(result)?
result.merge(a.download(result))
if result.is_success(): break
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment?
@@ -105,7 +105,7 @@ class ArchivingOrchestrator:
# eg: screenshot, wacz, webarchive, thumbnails
for e in self.enrichers:
try: e.enrich(result)
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
@@ -114,7 +114,7 @@ class ArchivingOrchestrator:
# 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)):
final_media.store()
final_media.store(url=url)
result.set_final_media(final_media)
if result.is_empty():

View File

@@ -62,7 +62,7 @@ class GsheetsDb(Database):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))

View File

@@ -26,6 +26,7 @@ class WhisperEnricher(Enricher):
return {
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
@@ -42,7 +43,7 @@ class WhisperEnricher(Enricher):
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store()
m.store(url=url)
try:
job_id = self.submit_job(m)
job_results[job_id] = False
@@ -58,8 +59,13 @@ class WhisperEnricher(Enricher):
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
to_enrich.media[i].set("whisper_model", {
"job_id": job_id,
self.action: job_results[job_id]
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
})
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
if job_results[job_id]:
for k,v in job_results[job_id].items():
if "_text" in k and len(v):
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
def submit_job(self, media: Media):
s3 = self._get_s3_storage()
@@ -100,18 +106,16 @@ class WhisperEnricher(Enricher):
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
logger.success(r_res.json())
result = []
for artifact in r_res.json():
result = {}
for art_id, artifact in enumerate(r_res.json()):
subtitle = []
full_text = []
for i, d in enumerate(artifact.get("data")):
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
full_text.append(d.get('text').strip())
if not len(subtitle): continue
result.append({
"subtitle": "\n".join(subtitle),
"full_text": "\n".join(full_text),
})
if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
return result
return False

View File

@@ -42,7 +42,7 @@
}
.copy:hover {
font-weight: 600;
background: aliceblue;
cursor: copy;
}

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "6"
_PATCH = "10"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""