From c3dd19f3092fe4aa88b31293aab85498e6802b2b Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 15 Jan 2025 17:02:19 +0100 Subject: [PATCH] Sniff filetype of downloaded media and add extension Also download in chunks - fixes 2 x TODOs --- poetry.lock | 16 ++++++++++-- pyproject.toml | 3 ++- src/auto_archiver/archivers/archiver.py | 33 +++++++++++++++++++------ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1b31740..97e1035 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -889,6 +889,18 @@ future = "*" [package.extras] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] +[[package]] +name = "filetype" +version = "1.2.0" +description = "Infer file type and MIME type of any file/buffer. No external dependencies." +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"}, + {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, +] + [[package]] name = "flask" version = "3.1.0" @@ -3296,4 +3308,4 @@ test = ["pytest (>=8.1,<9.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "7c7dc6d26e5af1c9bb6e4393b4ac64b155049d20a9f5317baec48c964a2708ac" +content-hash = "df1bd49271b2682b82da437c2e6ce3842d116aa0fc7769e9ab9958c91a8647b2" diff --git a/pyproject.toml b/pyproject.toml index 9fd4547..c5d2a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,8 @@ dependencies = [ "retrying (>=0.0.0)", "tsp-client (>=0.0.0)", "certvalidator (>=0.0.0)", - "toml (>=0.10.2,<0.11.0)" + "toml (>=0.10.2,<0.11.0)", + "filetype (>=1.2.0,<2.0.0)" ] [tool.poetry.group.dev.dependencies] diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index 25e08c3..24bb53c 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -1,6 +1,8 @@ from __future__ import annotations +from pathlib import Path from abc import abstractmethod from dataclasses import dataclass +import filetype import os import mimetypes, requests from loguru import logger @@ -46,10 +48,8 @@ class Archiver(Step): @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5) def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str: """ - downloads a URL to provided filename, or inferred from URL, returns local filename + downloads a URL to provided filename, or inferred from URL, returns local filename """ - # TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches - # TODO: should we guess the extension? if not to_filename: to_filename = url.split('/')[-1].split('?')[0] if len(to_filename) > 64: @@ -59,11 +59,28 @@ class Archiver(Step): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } - d = requests.get(url, headers=headers) - assert d.status_code == 200, f"got response code {d.status_code} for {url=}" - with open(to_filename, 'wb') as f: - f.write(d.content) - return to_filename + try: + d = requests.get(url, stream=True, headers=headers) + d.raise_for_status() + + # Peek at the first 256 bytes + first_256 = d.raw.read(256) + + # Use filetype to guess the extension if there isn't already one + if not Path(to_filename).suffix: + guessed = filetype.guess(first_256) + extension = guessed.extension if guessed else None + if extension: + to_filename += f".{extension}" + + with open(to_filename, 'wb') as f: + f.write(first_256) + for chunk in d.iter_content(chunk_size=8192): + f.write(chunk) + return to_filename + + except requests.RequestException as e: + logger.warning(f"Failed to fetch the Media URL: {e}") @abstractmethod def download(self, item: Metadata) -> Metadata: pass