mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3bd6bed825 | ||
|
|
2659675f06 | ||
|
|
9d44f4b207 | ||
|
|
5b0bff612e | ||
|
|
ae7ceba0e5 | ||
|
|
97821a81bc | ||
|
|
9191b38cf2 | ||
|
|
567edfc35e | ||
|
|
8c22a9df72 |
@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
|
|||||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||||
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
|
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||||
}
|
}
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
|
|||||||
attempts = 0
|
attempts = 0
|
||||||
seen_media = []
|
seen_media = []
|
||||||
message = ""
|
message = ""
|
||||||
time.sleep(4)
|
time.sleep(3)
|
||||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||||
while attempts < self.timeout and (not message or not len(seen_media)):
|
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||||
attempts += 1
|
attempts += 1
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ class ArchivingContext:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set(key, value, keep_on_reset: bool = False):
|
def set(key, value, keep_on_reset: bool = False):
|
||||||
logger.error(f"SET [{key}]={value}")
|
logger.debug(f"SET [{key}]={value}")
|
||||||
ac = ArchivingContext.get_instance()
|
ac = ArchivingContext.get_instance()
|
||||||
ac.configs[key] = value
|
ac.configs[key] = value
|
||||||
if keep_on_reset: ac.keep_on_reset.add(key)
|
if keep_on_reset: ac.keep_on_reset.add(key)
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class Metadata:
|
|||||||
# calls .store for all contained media. storages [Storage]
|
# calls .store for all contained media. storages [Storage]
|
||||||
storages = override_storages or ArchivingContext.get("storages")
|
storages = override_storages or ArchivingContext.get("storages")
|
||||||
for media in self.media:
|
for media in self.media:
|
||||||
media.store(override_storages=storages)
|
media.store(override_storages=storages, url=self.get_url())
|
||||||
|
|
||||||
def set(self, key: str, val: Any) -> Metadata:
|
def set(self, key: str, val: Any) -> Metadata:
|
||||||
self.metadata[key] = val
|
self.metadata[key] = val
|
||||||
@@ -89,7 +89,8 @@ class Metadata:
|
|||||||
|
|
||||||
def set_content(self, content: str) -> Metadata:
|
def set_content(self, content: str) -> Metadata:
|
||||||
# a dump with all the relevant content
|
# a dump with all the relevant content
|
||||||
return self.set("content", content)
|
append_content = (self.get("content", "") + content + "\n").strip()
|
||||||
|
return self.set("content", append_content)
|
||||||
|
|
||||||
def set_title(self, title: str) -> Metadata:
|
def set_title(self, title: str) -> Metadata:
|
||||||
return self.set("title", title)
|
return self.set("title", title)
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ class ArchivingOrchestrator:
|
|||||||
# Q: should this be refactored so it's just a.download(result)?
|
# Q: should this be refactored so it's just a.download(result)?
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success(): break
|
if result.is_success(): break
|
||||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
|
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||||
@@ -105,7 +105,7 @@ class ArchivingOrchestrator:
|
|||||||
# eg: screenshot, wacz, webarchive, thumbnails
|
# eg: screenshot, wacz, webarchive, thumbnails
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try: e.enrich(result)
|
try: e.enrich(result)
|
||||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
|
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 5 - store media
|
# 5 - store media
|
||||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class GsheetsDb(Database):
|
|||||||
batch_if_valid('archive', "\n".join(media.urls))
|
batch_if_valid('archive', "\n".join(media.urls))
|
||||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||||
batch_if_valid('title', item.get_title())
|
batch_if_valid('title', item.get_title())
|
||||||
batch_if_valid('text', item.get("content", "")[:500])
|
batch_if_valid('text', item.get("content", ""))
|
||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class WhisperEnricher(Enricher):
|
|||||||
return {
|
return {
|
||||||
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
|
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
|
||||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||||
|
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||||
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
|
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
|
||||||
|
|
||||||
@@ -58,8 +59,13 @@ class WhisperEnricher(Enricher):
|
|||||||
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
|
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
|
||||||
to_enrich.media[i].set("whisper_model", {
|
to_enrich.media[i].set("whisper_model", {
|
||||||
"job_id": job_id,
|
"job_id": job_id,
|
||||||
**job_results[job_id]
|
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
|
||||||
})
|
})
|
||||||
|
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
|
||||||
|
if job_results[job_id]:
|
||||||
|
for k,v in job_results[job_id].items():
|
||||||
|
if "_text" in k and len(v):
|
||||||
|
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
||||||
|
|
||||||
def submit_job(self, media: Media):
|
def submit_job(self, media: Media):
|
||||||
s3 = self._get_s3_storage()
|
s3 = self._get_s3_storage()
|
||||||
@@ -108,7 +114,7 @@ class WhisperEnricher(Enricher):
|
|||||||
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
||||||
full_text.append(d.get('text').strip())
|
full_text.append(d.get('text').strip())
|
||||||
if not len(subtitle): continue
|
if not len(subtitle): continue
|
||||||
result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
||||||
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
||||||
return result
|
return result
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "5"
|
_MINOR = "5"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "7"
|
_PATCH = "10"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user