fix: twitter hack videos extension detection

This commit is contained in:
msramalho
2023-07-26 16:12:56 +01:00
parent 086a9e6c84
commit 888ad8f004
3 changed files with 11 additions and 2 deletions

View File

@@ -18,7 +18,7 @@ steps:
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_enricher
# - pdq_hash_enricher
# - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
formatter: html_formatter # defaults to mute_formatter
storages:
- local_storage

View File

@@ -6,6 +6,7 @@ from slugify import slugify
from . import Archiver
from ..core import Metadata, Media
from ..utils.misc import remove_get_parameters
class TwitterArchiver(Archiver):
@@ -133,7 +134,7 @@ class TwitterArchiver(Archiver):
media = Media(filename="")
media.set("src", u)
ext = ""
if (mtype := mimetypes.guess_type(u)[0]):
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)

View File

@@ -2,6 +2,7 @@
import os, json, requests
from datetime import datetime
from loguru import logger
from urllib.parse import urlparse, urlunparse
def mkdir_if_not_exists(folder):
@@ -20,6 +21,13 @@ def expand_url(url):
logger.error(f'Failed to expand url {url}')
return url
def remove_get_parameters(url):
    """Return ``url`` with its query string (GET parameters) removed.

    Example: ``http://example.com/file.mp4?t=1`` becomes
    ``http://example.com/file.mp4``.

    Useful for ``mimetypes`` to work on the URL. Only the query component
    is blanked; every other component is passed through as parsed.
    """
    scheme, netloc, path, params, _query, fragment = urlparse(url)
    # Reassemble the URL with an empty query in place of the original one.
    return urlunparse((scheme, netloc, path, params, '', fragment))
def getattr_or(o: object, prop: str, default=None):
try: