From 9c7cab1ae202742539981485ae85c947d07f5f6a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 22 Oct 2025 21:07:12 +0100 Subject: [PATCH 1/5] dependencies update --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index ca589a5..9664319 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3058,14 +3058,14 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruamel-yaml" -version = "0.18.15" +version = "0.18.16" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "ruamel.yaml-0.18.15-py3-none-any.whl", hash = "sha256:148f6488d698b7a5eded5ea793a025308b25eca97208181b6a026037f391f701"}, - {file = "ruamel.yaml-0.18.15.tar.gz", hash = "sha256:dbfca74b018c4c3fba0b9cc9ee33e53c371194a9000e694995e620490fd40700"}, + {file = "ruamel.yaml-0.18.16-py3-none-any.whl", hash = "sha256:048f26d64245bae57a4f9ef6feb5b552a386830ef7a826f235ffb804c59efbba"}, + {file = "ruamel.yaml-0.18.16.tar.gz", hash = "sha256:a6e587512f3c998b2225d68aa1f35111c29fad14aed561a26e73fab729ec5e5a"}, ] [package.dependencies] From 43cbc6ac56f4d1869f3e4c45d7929556fc6bee7d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:51:14 +0100 Subject: [PATCH 2/5] generic extractor improvements --- .../generic_extractor/generic_extractor.py | 17 +++++++++-------- tests/extractors/test_generic_extractor.py | 3 +-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 71bdbf2..dae9381 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -4,6 +4,7 @@ import datetime import os import importlib import subprocess +import traceback import zipfile from typing import Generator, Type @@ -305,7 +306,7 @@ class GenericExtractor(Extractor): result.set_url(url) if "description" in video_data and not result.get("content"): - result.set_content(video_data.get("description")) + result.set_content(video_data.pop("description")) # extract comments if enabled if self.comments and video_data.get("comments", None) is not None: result.set( @@ -406,9 +407,9 @@ class GenericExtractor(Extractor): logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}") result.add_media(new_media) except Exception as e: - logger.error(f"Error processing entry {entry}: {e}") + logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}") if not len(result.media): - logger.info(f"No media found for entry {entry}, skipping.") + logger.info(f"No media found for entry {str(entry)[:256]}, skipping.") return False return self.add_metadata(data, info_extractor, url, result) @@ -604,9 +605,9 @@ class GenericExtractor(Extractor): validated_options ) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" + result: Metadata = None for info_extractor in self.suitable_extractors(url): - result = self.download_for_extractor(info_extractor, url, ydl) - if result: - return result - - return False + local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl) + if local_result: + result = result.merge(local_result) if result else local_result + return result if result else False diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index fe3baf5..e5d2f3a 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase): ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]), ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]), ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]), - ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]), - ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]), ], ) def test_suitable_extractors(self, url, suitable_extractors): @@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase): def test_bluesky_download_video(self, make_item): item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i") result = self.extractor.download(item) + assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i" assert result is not False @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.") From 01bdb35f5d4555bedf42771bd7fe5192a94829c8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:51:31 +0100 Subject: [PATCH 3/5] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f5c569..0229dd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.1.5" +version = "1.1.6" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" From dbb3dfa04fb52d8f3617528bb6840b6408db78e7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:04:44 +0100 Subject: [PATCH 4/5] fixes wikipedia test --- tests/extractors/test_antibot_extractor_enricher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 3ec34f8..2fd89c0 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -57,7 +57,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "https://en.wikipedia.org/wiki/Western_barn_owl", "western barn owl", "Tyto alba", - 5, + 4, 0, ), ( From 3a22cc28c00c14f970355af082fbf78c5cab2701 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:17:14 +0100 Subject: [PATCH 5/5] skip tiktok antibot test in CI --- .../test_antibot_extractor_enricher.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 2fd89c0..9b4fdc4 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher from .test_extractor_base import TestExtractorBase +CI = os.getenv("GITHUB_ACTIONS", "") == "true" + + class DummySB: def __init__(self, url="", title="", visible_texts=None, visible_elements=None): self._url = url @@ -51,7 +54,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): @pytest.mark.download @pytest.mark.parametrize( - "url,in_title,in_text,image_count,video_count", + "url,in_title,in_text,image_count,video_count,skip_ci", [ ( "https://en.wikipedia.org/wiki/Western_barn_owl", @@ -59,6 +62,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "Tyto alba", 4, 0, + False, ), ( "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/", @@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "Bellingcat has geolocated", 5, 0, + False, ), ( "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/", @@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "continued the work of Gazan journalists", 5, 1, + False, ), ( "https://www.bellingcat.com/about/general-information", @@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "Stichting Bellingcat", 0, # SVGs are ignored 0, + False, ), ( "https://vk.com/wikipedia?from=search&w=wall-36156673_20451", @@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "16 сентября 1985 года лейблом EMI Records.", 5, 0, + False, ), ( "https://www.tiktok.com/@tracy_2424/photo/7418200173953830162", @@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "Dito ko lang", 1, 0, + True, ), ], ) - def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count): + def test_download_pages_with_media( + self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci + ): """ Test downloading pages with media. """ + if CI and skip_ci: + pytest.skip("Skipping test in CI environment") + self.extractor = setup_module( self.extractor_module, self.config