From bc06de8e5c5002f621571ed144851992d3965d9d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:34:47 +0100 Subject: [PATCH] fixes incomplete yt-dlp parts download --- src/auto_archiver/core/metadata.py | 4 +++ src/auto_archiver/utils/misc.py | 3 ++ tests/test_metadata.py | 16 +++++++++++ tests/utils/test_misc.py | 45 ++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 05f7d9c..f8fb5f5 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -11,6 +11,7 @@ Key Functionalities: from __future__ import annotations import hashlib +import os from typing import Any, List, Union, Dict from dataclasses import dataclass, field from dataclasses_json import dataclass_json @@ -186,6 +187,9 @@ class Metadata: continue h = m.get("hash") if not h: + if not os.path.exists(m.filename): + logger.warning(f"Skipping missing media file: {m.filename}") + continue h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename) if len(h) and h in media_hashes: continue diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 747d57e..c16ed5c 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -120,6 +120,9 @@ def ydl_entry_to_filename(ydl, entry: dict) -> str: directory = os.path.dirname(base_filename) # '/get/path/to' basename = os.path.basename(base_filename) # 'file' for f in os.listdir(directory): + # skip incomplete downloads left behind by yt-dlp + if f.endswith(".part"): + continue if ( f.startswith(basename) or (entry_url and os.path.splitext(f)[0] in entry_url) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index e838979..9908d92 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -86,6 +86,22 @@ def test_media_management(basic_metadata, media_file): assert basic_metadata.get_media_by_id("m1") == media1 +def test_remove_duplicate_skips_missing_files(basic_metadata, media_file, tmp_path): + """Missing files should be dropped instead of crashing with FileNotFoundError.""" + real_file = tmp_path / "exists.txt" + real_file.write_text("content") + valid = media_file(filename=str(real_file), hash_value="abc") + missing = media_file(filename="/nonexistent/path/gone.mp4") + + basic_metadata.add_media(valid, "valid") + basic_metadata.add_media(missing, "missing") + + assert len(basic_metadata.media) == 2 + basic_metadata.remove_duplicate_media_by_hash() + assert len(basic_metadata.media) == 1 + assert basic_metadata.get_media_by_id("valid") == valid + + def test_success(): m = Metadata() assert not m.is_success() diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py index a3902c7..05dce22 100644 --- a/tests/utils/test_misc.py +++ b/tests/utils/test_misc.py @@ -14,6 +14,7 @@ from auto_archiver.utils.misc import ( calculate_file_hash, random_str, get_timestamp, + ydl_entry_to_filename, ) @@ -139,3 +140,47 @@ class TestMiscUtils: def test_invalid_timestamp_returns_none(self): assert get_timestamp("invalid-date") is None + + +class TestYdlEntryToFilename: + """Tests for ydl_entry_to_filename, especially .part file filtering.""" + + def _make_mock_ydl(self, prepared_filename): + class MockYDL: + def prepare_filename(self, entry): + return prepared_filename + + return MockYDL() + + def test_returns_exact_file_if_exists(self, tmp_path): + video = tmp_path / "video.mp4" + video.write_bytes(b"data") + ydl = self._make_mock_ydl(str(video)) + assert ydl_entry_to_filename(ydl, {}) == str(video) + + def test_skips_part_file_returns_complete(self, tmp_path): + """Simulates yt-dlp leaving a .part file from a failed format + while a complete .webm exists.""" + (tmp_path / "f5U3IKfoSYs.f399.mp4.part").write_bytes(b"incomplete") + webm = tmp_path / "f5U3IKfoSYs.webm" + webm.write_bytes(b"complete video") + + # ydl.prepare_filename returns the expected .mp4 which doesn't exist + ydl = self._make_mock_ydl(str(tmp_path / "f5U3IKfoSYs.mp4")) + result = ydl_entry_to_filename(ydl, {}) + + assert result == str(webm) + assert not result.endswith(".part") + + def test_skips_part_file_returns_false_if_no_other_match(self, tmp_path): + """Only a .part file exists — should return False.""" + (tmp_path / "video.f399.mp4.part").write_bytes(b"incomplete") + + ydl = self._make_mock_ydl(str(tmp_path / "video.mp4")) + assert ydl_entry_to_filename(ydl, {}) is False + + def test_returns_false_when_no_files_match(self, tmp_path): + (tmp_path / "unrelated.txt").write_bytes(b"data") + + ydl = self._make_mock_ydl(str(tmp_path / "video.mp4")) + assert ydl_entry_to_filename(ydl, {}) is False