mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
fixes incomplete yt-dlp parts download
This commit is contained in:
@@ -11,6 +11,7 @@ Key Functionalities:
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
from typing import Any, List, Union, Dict
|
from typing import Any, List, Union, Dict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json
|
from dataclasses_json import dataclass_json
|
||||||
@@ -186,6 +187,9 @@ class Metadata:
|
|||||||
continue
|
continue
|
||||||
h = m.get("hash")
|
h = m.get("hash")
|
||||||
if not h:
|
if not h:
|
||||||
|
if not os.path.exists(m.filename):
|
||||||
|
logger.warning(f"Skipping missing media file: {m.filename}")
|
||||||
|
continue
|
||||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||||
if len(h) and h in media_hashes:
|
if len(h) and h in media_hashes:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -120,6 +120,9 @@ def ydl_entry_to_filename(ydl, entry: dict) -> str:
|
|||||||
directory = os.path.dirname(base_filename) # '/get/path/to'
|
directory = os.path.dirname(base_filename) # '/get/path/to'
|
||||||
basename = os.path.basename(base_filename) # 'file'
|
basename = os.path.basename(base_filename) # 'file'
|
||||||
for f in os.listdir(directory):
|
for f in os.listdir(directory):
|
||||||
|
# skip incomplete downloads left behind by yt-dlp
|
||||||
|
if f.endswith(".part"):
|
||||||
|
continue
|
||||||
if (
|
if (
|
||||||
f.startswith(basename)
|
f.startswith(basename)
|
||||||
or (entry_url and os.path.splitext(f)[0] in entry_url)
|
or (entry_url and os.path.splitext(f)[0] in entry_url)
|
||||||
|
|||||||
@@ -86,6 +86,22 @@ def test_media_management(basic_metadata, media_file):
|
|||||||
assert basic_metadata.get_media_by_id("m1") == media1
|
assert basic_metadata.get_media_by_id("m1") == media1
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_duplicate_skips_missing_files(basic_metadata, media_file, tmp_path):
|
||||||
|
"""Missing files should be dropped instead of crashing with FileNotFoundError."""
|
||||||
|
real_file = tmp_path / "exists.txt"
|
||||||
|
real_file.write_text("content")
|
||||||
|
valid = media_file(filename=str(real_file), hash_value="abc")
|
||||||
|
missing = media_file(filename="/nonexistent/path/gone.mp4")
|
||||||
|
|
||||||
|
basic_metadata.add_media(valid, "valid")
|
||||||
|
basic_metadata.add_media(missing, "missing")
|
||||||
|
|
||||||
|
assert len(basic_metadata.media) == 2
|
||||||
|
basic_metadata.remove_duplicate_media_by_hash()
|
||||||
|
assert len(basic_metadata.media) == 1
|
||||||
|
assert basic_metadata.get_media_by_id("valid") == valid
|
||||||
|
|
||||||
|
|
||||||
def test_success():
|
def test_success():
|
||||||
m = Metadata()
|
m = Metadata()
|
||||||
assert not m.is_success()
|
assert not m.is_success()
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from auto_archiver.utils.misc import (
|
|||||||
calculate_file_hash,
|
calculate_file_hash,
|
||||||
random_str,
|
random_str,
|
||||||
get_timestamp,
|
get_timestamp,
|
||||||
|
ydl_entry_to_filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -139,3 +140,47 @@ class TestMiscUtils:
|
|||||||
|
|
||||||
def test_invalid_timestamp_returns_none(self):
|
def test_invalid_timestamp_returns_none(self):
|
||||||
assert get_timestamp("invalid-date") is None
|
assert get_timestamp("invalid-date") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestYdlEntryToFilename:
|
||||||
|
"""Tests for ydl_entry_to_filename, especially .part file filtering."""
|
||||||
|
|
||||||
|
def _make_mock_ydl(self, prepared_filename):
|
||||||
|
class MockYDL:
|
||||||
|
def prepare_filename(self, entry):
|
||||||
|
return prepared_filename
|
||||||
|
|
||||||
|
return MockYDL()
|
||||||
|
|
||||||
|
def test_returns_exact_file_if_exists(self, tmp_path):
|
||||||
|
video = tmp_path / "video.mp4"
|
||||||
|
video.write_bytes(b"data")
|
||||||
|
ydl = self._make_mock_ydl(str(video))
|
||||||
|
assert ydl_entry_to_filename(ydl, {}) == str(video)
|
||||||
|
|
||||||
|
def test_skips_part_file_returns_complete(self, tmp_path):
|
||||||
|
"""Simulates yt-dlp leaving a .part file from a failed format
|
||||||
|
while a complete .webm exists."""
|
||||||
|
(tmp_path / "f5U3IKfoSYs.f399.mp4.part").write_bytes(b"incomplete")
|
||||||
|
webm = tmp_path / "f5U3IKfoSYs.webm"
|
||||||
|
webm.write_bytes(b"complete video")
|
||||||
|
|
||||||
|
# ydl.prepare_filename returns the expected .mp4 which doesn't exist
|
||||||
|
ydl = self._make_mock_ydl(str(tmp_path / "f5U3IKfoSYs.mp4"))
|
||||||
|
result = ydl_entry_to_filename(ydl, {})
|
||||||
|
|
||||||
|
assert result == str(webm)
|
||||||
|
assert not result.endswith(".part")
|
||||||
|
|
||||||
|
def test_skips_part_file_returns_false_if_no_other_match(self, tmp_path):
|
||||||
|
"""Only a .part file exists — should return False."""
|
||||||
|
(tmp_path / "video.f399.mp4.part").write_bytes(b"incomplete")
|
||||||
|
|
||||||
|
ydl = self._make_mock_ydl(str(tmp_path / "video.mp4"))
|
||||||
|
assert ydl_entry_to_filename(ydl, {}) is False
|
||||||
|
|
||||||
|
def test_returns_false_when_no_files_match(self, tmp_path):
|
||||||
|
(tmp_path / "unrelated.txt").write_bytes(b"data")
|
||||||
|
|
||||||
|
ydl = self._make_mock_ydl(str(tmp_path / "video.mp4"))
|
||||||
|
assert ydl_entry_to_filename(ydl, {}) is False
|
||||||
|
|||||||
Reference in New Issue
Block a user