mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
56526a9ac7 | ||
|
|
3a22cc28c0 | ||
|
|
dbb3dfa04f | ||
|
|
01bdb35f5d | ||
|
|
43cbc6ac56 | ||
|
|
9c7cab1ae2 |
6
poetry.lock
generated
6
poetry.lock
generated
@@ -3058,14 +3058,14 @@ pyasn1 = ">=0.1.3"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ruamel-yaml"
|
name = "ruamel-yaml"
|
||||||
version = "0.18.15"
|
version = "0.18.16"
|
||||||
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
|
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
files = [
|
files = [
|
||||||
{file = "ruamel.yaml-0.18.15-py3-none-any.whl", hash = "sha256:148f6488d698b7a5eded5ea793a025308b25eca97208181b6a026037f391f701"},
|
{file = "ruamel.yaml-0.18.16-py3-none-any.whl", hash = "sha256:048f26d64245bae57a4f9ef6feb5b552a386830ef7a826f235ffb804c59efbba"},
|
||||||
{file = "ruamel.yaml-0.18.15.tar.gz", hash = "sha256:dbfca74b018c4c3fba0b9cc9ee33e53c371194a9000e694995e620490fd40700"},
|
{file = "ruamel.yaml-0.18.16.tar.gz", hash = "sha256:a6e587512f3c998b2225d68aa1f35111c29fad14aed561a26e73fab729ec5e5a"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.1.5"
|
version = "1.1.6"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import datetime
|
|||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from typing import Generator, Type
|
from typing import Generator, Type
|
||||||
@@ -305,7 +306,7 @@ class GenericExtractor(Extractor):
|
|||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
|
|
||||||
if "description" in video_data and not result.get("content"):
|
if "description" in video_data and not result.get("content"):
|
||||||
result.set_content(video_data.get("description"))
|
result.set_content(video_data.pop("description"))
|
||||||
# extract comments if enabled
|
# extract comments if enabled
|
||||||
if self.comments and video_data.get("comments", None) is not None:
|
if self.comments and video_data.get("comments", None) is not None:
|
||||||
result.set(
|
result.set(
|
||||||
@@ -406,9 +407,9 @@ class GenericExtractor(Extractor):
|
|||||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||||
result.add_media(new_media)
|
result.add_media(new_media)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing entry {entry}: {e}")
|
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
|
||||||
if not len(result.media):
|
if not len(result.media):
|
||||||
logger.info(f"No media found for entry {entry}, skipping.")
|
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
@@ -604,9 +605,9 @@ class GenericExtractor(Extractor):
|
|||||||
validated_options
|
validated_options
|
||||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||||
|
|
||||||
|
result: Metadata = None
|
||||||
for info_extractor in self.suitable_extractors(url):
|
for info_extractor in self.suitable_extractors(url):
|
||||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
|
||||||
if result:
|
if local_result:
|
||||||
return result
|
result = result.merge(local_result) if result else local_result
|
||||||
|
return result if result else False
|
||||||
return False
|
|
||||||
|
|||||||
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
|||||||
from .test_extractor_base import TestExtractorBase
|
from .test_extractor_base import TestExtractorBase
|
||||||
|
|
||||||
|
|
||||||
|
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||||
|
|
||||||
|
|
||||||
class DummySB:
|
class DummySB:
|
||||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||||
self._url = url
|
self._url = url
|
||||||
@@ -51,14 +54,15 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
|
|
||||||
@pytest.mark.download
|
@pytest.mark.download
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"url,in_title,in_text,image_count,video_count",
|
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||||
"western barn owl",
|
"western barn owl",
|
||||||
"Tyto alba",
|
"Tyto alba",
|
||||||
5,
|
4,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||||
@@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Bellingcat has geolocated",
|
"Bellingcat has geolocated",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||||
@@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"continued the work of Gazan journalists",
|
"continued the work of Gazan journalists",
|
||||||
5,
|
5,
|
||||||
1,
|
1,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/about/general-information",
|
"https://www.bellingcat.com/about/general-information",
|
||||||
@@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Stichting Bellingcat",
|
"Stichting Bellingcat",
|
||||||
0, # SVGs are ignored
|
0, # SVGs are ignored
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||||
@@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"16 сентября 1985 года лейблом EMI Records.",
|
"16 сентября 1985 года лейблом EMI Records.",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||||
@@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Dito ko lang",
|
"Dito ko lang",
|
||||||
1,
|
1,
|
||||||
0,
|
0,
|
||||||
|
True,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
def test_download_pages_with_media(
|
||||||
|
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Test downloading pages with media.
|
Test downloading pages with media.
|
||||||
"""
|
"""
|
||||||
|
if CI and skip_ci:
|
||||||
|
pytest.skip("Skipping test in CI environment")
|
||||||
|
|
||||||
self.extractor = setup_module(
|
self.extractor = setup_module(
|
||||||
self.extractor_module,
|
self.extractor_module,
|
||||||
self.config
|
self.config
|
||||||
|
|||||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
|
||||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suitable_extractors(self, url, suitable_extractors):
|
def test_suitable_extractors(self, url, suitable_extractors):
|
||||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
def test_bluesky_download_video(self, make_item):
|
def test_bluesky_download_video(self, make_item):
|
||||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||||
result = self.extractor.download(item)
|
result = self.extractor.download(item)
|
||||||
|
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
|
||||||
assert result is not False
|
assert result is not False
|
||||||
|
|
||||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||||
|
|||||||
Reference in New Issue
Block a user