diff --git a/example.orchestration.yaml b/example.orchestration.yaml index a040f5c..eee7e0f 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -18,7 +18,7 @@ steps: # - thumbnail_enricher # - wayback_archiver_enricher # - wacz_enricher - # - pdq_hash_enricher + # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher formatter: html_formatter # defaults to mute_formatter storages: - local_storage diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index dfa9504..b99e0d8 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -6,6 +6,7 @@ from slugify import slugify from . import Archiver from ..core import Metadata, Media +from ..utils.misc import remove_get_parameters class TwitterArchiver(Archiver): @@ -133,7 +134,7 @@ class TwitterArchiver(Archiver): media = Media(filename="") media.set("src", u) ext = "" - if (mtype := mimetypes.guess_type(u)[0]): + if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]): ext = mimetypes.guess_extension(mtype) media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item) diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 22d5502..c96a39f 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -2,6 +2,7 @@ import os, json, requests from datetime import datetime from loguru import logger +from urllib.parse import urlparse, urlunparse def mkdir_if_not_exists(folder): @@ -20,6 +21,13 @@ def expand_url(url): logger.error(f'Failed to expand url {url}') return url +def remove_get_parameters(url): + # Return url with its query string removed, e.g. http://example.com/file.mp4?t=1 -> http://example.com/file.mp4 + # Only the query is cleared (scheme, host, path and any fragment are kept); callers pass the result to mimetypes.guess_type so the file extension ends the path + parsed_url = urlparse(url) + new_url = urlunparse(parsed_url._replace(query='')) + return new_url + def getattr_or(o: object, prop: str, default=None): try: