0.8.0 new features and dependency updates (#119)

* wacz can extract_screenshot only

* new meta enricher

* twitter api can use multiple authentication tokens in sequence

* cleanup non-dup logic

* meta info on archive duration

* minor html report update

* updated dependencies

* new version
This commit is contained in:
Miguel Sozinho Ramalho
2023-12-20 14:13:22 +00:00
committed by GitHub
parent 499832d146
commit e6b6b83007
8 changed files with 413 additions and 332 deletions

View File

@@ -16,36 +16,55 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
def __init__(self, config: dict) -> None:
    """Build the list of Twitter API clients from the configured credentials.

    One client is created per entry in ``bearer_tokens``, one for the
    deprecated single ``bearer_token`` (if set), and one for the
    consumer/access key quadruple (if all four values are set). Clients are
    stored in ``self.apis`` and ``self.api_index`` selects the active one.

    Raises:
        AssertionError: if no credentials were provided at all.
    """
    super().__init__(config)
    self.api_index = 0
    self.apis = []

    # truthiness check also tolerates bearer_tokens being None
    if self.bearer_tokens:
        self.apis.extend(Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens)

    if self.bearer_token:
        self.assert_valid_string("bearer_token")
        self.apis.append(Api(bearer_token=self.bearer_token))

    if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
        for option in ("consumer_key", "consumer_secret", "access_token", "access_secret"):
            self.assert_valid_string(option)
        self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
                             access_token=self.access_token, access_secret=self.access_secret))

    # check the list itself: reading self.api_client on an empty list would
    # raise IndexError before the explanatory message could be shown
    assert len(self.apis) > 0, "Missing Twitter API configurations, please provide bearer_token(s) AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
@staticmethod
def configs() -> dict:
    """Return the configuration options accepted by this archiver.

    Each key maps to a dict with a ``default`` and a ``help`` text.
    ``bearer_tokens`` additionally carries a ``cli_set`` callable that turns
    a comma-separated command-line value into a list of tokens.
    """
    def parse_bearer_tokens(cli_val, cur_val):
        # Tokens are tried in sequence, so keep the order the user gave:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        # Surrounding whitespace and empty entries are dropped.
        stripped = (token.strip() for token in cli_val.split(","))
        return list(dict.fromkeys(token for token in stripped if token))

    return {
        "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
        "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": parse_bearer_tokens},
        "consumer_key": {"default": None, "help": "twitter API consumer_key"},
        "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
        "access_token": {"default": None, "help": "twitter API access_token"},
        "access_secret": {"default": None, "help": "twitter API access_secret"},
    }
@property
def api_client(self) -> "Api":
    """Return the API client currently selected by ``self.api_index``.

    Raises:
        IndexError: if ``self.api_index`` is outside ``self.apis``
            (including when the list is empty).
    """
    return self.apis[self.api_index]
def download(self, item: Metadata) -> Metadata:
    """Try to archive ``item`` with each configured API client in turn.

    Starts at the client currently selected by ``self.api_index`` and moves
    to the next one, wrapping around the list, whenever ``download_retry``
    returns a falsy result. Every client is attempted at most once per call,
    so clients positioned before the current index are not skipped.

    Returns:
        The truthy result of the first successful ``download_retry`` call
        (``self.api_index`` stays on the client that succeeded), or
        ``False`` when all clients failed or none are configured.
    """
    total_apis = len(self.apis)
    for _ in range(total_apis):
        if res := self.download_retry(item):
            return res
        # advance to the next client, wrapping so earlier ones get a turn too
        self.api_index = (self.api_index + 1) % total_apis
    return False
def download_retry(self, item: Metadata) -> Metadata:
url = item.get_url()
# detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url)
if not username: return False
try:
tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
logger.debug(tweet)
except Exception as e:
logger.error(f"Could not get tweet: {e}")
return False

View File

@@ -6,4 +6,5 @@ from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczArchiverEnricher
from .whisper_enricher import WhisperEnricher
from .pdq_hash_enricher import PdqHashEnricher
from .metadata_enricher import MetadataEnricher
from .metadata_enricher import MetadataEnricher
from .meta_enricher import MetaEnricher

View File

@@ -0,0 +1,55 @@
import datetime
import os
from loguru import logger
from . import Enricher
from ..core import Metadata
class MetaEnricher(Enricher):
    """
    Adds metadata information about the archive operations, to be included at the end of all enrichments
    """
    name = "meta_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this enricher (it has none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """Record file sizes and the archive duration on ``to_enrich``."""
        logger.debug(f"calculating archive metadata information for url={to_enrich.get_url()}")
        self.enrich_file_sizes(to_enrich)
        self.enrich_archive_duration(to_enrich)

    def enrich_file_sizes(self, to_enrich):
        """Store the size of every media file and the total size.

        Sets ``bytes`` and ``size`` on each media entry, and ``total_bytes``
        and ``total_size`` on ``to_enrich``. A media file that cannot be
        stat-ed is logged and left out of the total instead of aborting the
        whole enrichment.
        """
        logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
        total_size = 0
        for m in to_enrich.media:
            try:
                file_size = os.stat(m.filename).st_size
            except OSError as e:
                logger.warning(f"could not read file size of {m.filename}: {e}")
                continue
            m.set("bytes", file_size)
            m.set("size", self.human_readable_bytes(file_size))
            total_size += file_size
        to_enrich.set("total_bytes", total_size)
        to_enrich.set("total_size", self.human_readable_bytes(total_size))

    def human_readable_bytes(self, size: int) -> str:
        """Format a number of bytes as a human readable size.

        Uses 1024-based steps and one decimal place, e.g. ``1536`` becomes
        ``"1.5 KB"``. Sizes of 1024 TB and above are expressed in PB.
        """
        for unit in ["bytes", "KB", "MB", "GB", "TB"]:
            if size < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        # the loop divided once per listed unit, so the remainder is in PB
        return f"{size:.1f} PB"

    def enrich_archive_duration(self, to_enrich):
        """Store the seconds elapsed since ``_processed_at`` as ``archive_duration_seconds``.

        Nothing is stored when ``_processed_at`` is not a datetime.
        """
        logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
        processed_at = to_enrich.get("_processed_at")
        if not isinstance(processed_at, datetime.datetime):
            logger.warning(f"cannot calculate archive duration for url={to_enrich.get_url()}: _processed_at is not set")
            return
        # NOTE(review): utcnow() is naive, so _processed_at is presumably a naive UTC datetime - confirm
        archive_duration = datetime.datetime.utcnow() - processed_at
        # total_seconds() includes whole days, which timedelta.seconds leaves out
        to_enrich.set("archive_duration_seconds", int(archive_duration.total_seconds()))

View File

@@ -30,7 +30,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}
}
def download(self, item: Metadata) -> Metadata:
@@ -105,7 +106,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
return False
to_enrich.add_media(Media(wacz_fn), "browsertrix")
if self.extract_media:
if self.extract_media or self.extract_screenshot:
self.extract_media_from_wacz(to_enrich, wacz_fn)
if use_docker:
@@ -155,12 +156,13 @@ class WaczArchiverEnricher(Enricher, Archiver):
with open(warc_filename, 'rb') as warc_stream:
for record in ArchiveIterator(warc_stream):
# only include fetched resources
if record.rec_type == "resource": # screenshots
if record.rec_type == "resource" and self.extract_screenshot: # screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
m = Media(filename=fn)
to_enrich.add_media(m, "browsertrix-screenshot")
counter += 1
if not self.extract_media: continue
if record.rec_type != 'response': continue
record_url = record.rec_headers.get_header('WARC-Target-URI')

View File

@@ -115,7 +115,7 @@
<table class="content">
<tr>
<th>about</th>
<th>preview(s)</th>
<th>files and preview</th>
</tr>
<tbody>
{% for m in media %}

View File

@@ -52,20 +52,6 @@ class S3Storage(Storage):
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
if not self.is_upload_needed(media): return True
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return True
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
@@ -89,6 +75,7 @@ class S3Storage(Storage):
if existing_key:=self.file_in_folder(path):
media.key = existing_key
media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "7"
_MINOR = "8"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "10"
_PATCH = "0"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""