mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
wacz: allow exceptional cases where more than one resource image is available
This commit is contained in:
@@ -194,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
shutil.copyfileobj(infile, outfile)
|
shutil.copyfileobj(infile, outfile)
|
||||||
|
|
||||||
# get media out of .warc
|
# get media out of .warc
|
||||||
counter = 0
|
counter_warc_files = 0
|
||||||
|
counter_screenshots = 0
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
|
|
||||||
with open(warc_filename, "rb") as warc_stream:
|
with open(warc_filename, "rb") as warc_stream:
|
||||||
@@ -203,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
if (
|
if (
|
||||||
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
|
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
|
||||||
): # screenshots
|
): # screenshots
|
||||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
|
||||||
with open(fn, "wb") as outf:
|
with open(fn, "wb") as outf:
|
||||||
outf.write(record.raw_stream.read())
|
outf.write(record.raw_stream.read())
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
to_enrich.add_media(m, "browsertrix-screenshot")
|
to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
|
||||||
counter += 1
|
counter_screenshots += 1
|
||||||
if not self.extract_media:
|
if not self.extract_media:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -231,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
# create local file and add media
|
# create local file and add media
|
||||||
ext = mimetypes.guess_extension(content_type)
|
ext = mimetypes.guess_extension(content_type)
|
||||||
warc_fn = f"warc-file-{counter}{ext}"
|
warc_fn = f"warc-file-{counter_screenshots}{ext}"
|
||||||
fn = os.path.join(tmp_dir, warc_fn)
|
fn = os.path.join(tmp_dir, warc_fn)
|
||||||
|
|
||||||
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
||||||
@@ -256,6 +257,6 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
to_enrich.add_media(m, warc_fn)
|
to_enrich.add_media(m, warc_fn)
|
||||||
counter += 1
|
counter_warc_files += 1
|
||||||
seen_urls.add(record_url)
|
seen_urls.add(record_url)
|
||||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)")
|
||||||
|
|||||||
@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
|
|||||||
metadata.add_media(Media("something.wacz"), "browsertrix")
|
metadata.add_media(Media("something.wacz"), "browsertrix")
|
||||||
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
|
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
|
||||||
assert len(metadata.media) == 2
|
assert len(metadata.media) == 2
|
||||||
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
|
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"
|
||||||
|
|||||||
Reference in New Issue
Block a user