Improve handling of ignored content from WACZs

This commit is contained in:
msramalho
2023-07-28 12:19:14 +01:00
parent 7a5c9c65bd
commit aa71c85a98
3 changed files with 32 additions and 2 deletions

View File

@@ -180,6 +180,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
m.set("src_alternative", record_url)
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
# remove bad videos
if m.is_video() and not m.is_valid_video(): continue
to_enrich.add_media(m, warc_fn)
counter += 1
seen_urls.add(record_url)