mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Further refactoring of youtubedl_archiver->base_archiver
* Keep twitter_api_archiver
* Remove unit tests for obsolete archivers
* Guess filename of media using the 'Content-Type' header
* Add mechanism to run 'expensive' tests last (see conftest.py), and flag expensive test classes so that the remaining tests fail straight away once one test has failed (pytest.mark.incremental)
This commit is contained in:
141
tests/archivers/test_base_archiver.py
Normal file
141
tests/archivers/test_base_archiver.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import datetime
|
||||
|
||||
from auto_archiver.archivers.base_archiver import BaseArchiver
|
||||
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
|
||||
class TestBaseArchiver(TestArchiverBase):
    """Tests for BaseArchiver (the refactored successor of the youtubedl archiver).

    ``self.archiver`` and ``assertValidResponseMetadata`` are not defined in this
    class; presumably they are provided by ``TestArchiverBase`` from
    ``archiver_class`` and ``config`` -- TODO confirm against the base class.
    Tests marked ``download`` call ``self.archiver.download`` on real URLs.
    """

    archiver_class = BaseArchiver
    config = {
        'subtitles': False,
        'comments': False,
        'livestreams': False,
        'live_from_start': False,
        'end_means_success': True,
        'allow_playlist': False,
        'max_downloads': "inf",
        'proxy': None,
        'cookies_from_browser': False,
        'cookie_file': None,
    }

    @pytest.mark.parametrize("url, is_suitable", [
        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
        ("https://www.twitch.tv/videos/1167226570", True),
        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
        ("https://google.com", True)])
    def test_suitable_urls(self, make_item, url, is_suitable):
        """Check that the archiver reports every listed URL as suitable.

        Note: the expected behaviour is to return True for all URLs, as the
        BaseArchiver (formerly YoutubeDLArchiver) should be able to handle all URLs.
        This behaviour may be changed in the future (e.g. if we want the archiver
        to just handle URLs it has extractors for, and then, if and only if all
        other archivers fail, fall back to the generic archiver).
        """
        assert self.archiver.suitable(url) == is_suitable

    @pytest.mark.download
    def test_download_tiktok(self, make_item):
        """The downloaded result keeps the URL of the requested TikTok video."""
        url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
        result = self.archiver.download(make_item(url))
        assert result.get_url() == url

    @pytest.mark.download
    def test_youtube_download(self, make_item):
        """A YouTube download yields title, description, a video and a thumbnail."""
        # "Keyboard Cat" video; the media filenames asserted below derive from its id
        url = "https://www.youtube.com/watch?v=J---aiyznGQ"
        result = self.archiver.download(make_item(url))
        assert result.get_url() == url
        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
        assert len(result.media) == 2
        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
        assert Path(result.media[1].filename).name == "hqdefault.jpg"

    @pytest.mark.download
    def test_bluesky_download_multiple_images(self, make_item):
        """Downloading a Bluesky post does not return False."""
        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
        result = self.archiver.download(item)
        assert result is not False

    @pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
    @pytest.mark.download
    def test_bluesky_download_single_image(self, make_item):
        """Downloading a single-image Bluesky post does not return False (skipped)."""
        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l")
        result = self.archiver.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_no_media(self, make_item):
        """Downloading a Bluesky post does not return False."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
        result = self.archiver.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_video(self, make_item):
        """Downloading a Bluesky post does not return False."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
        result = self.archiver.download(item)
        assert result is not False

    @pytest.mark.download
    def test_twitter_download_nonexistend_tweet(self, make_item):
        """Downloading a tweet that does not exist gives a falsy response."""
        # this tweet does not exist
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.archiver.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_malformed_tweetid(self, make_item):
        """Downloading a URL with a malformed tweet id gives a falsy response."""
        # the status id contains a letter ("a"), so it is not a valid tweet id
        url = "https://x.com/Bellingcat/status/1719702a586071100058"
        response = self.archiver.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_tweet_no_media(self, make_item):
        """A text-only tweet is archived with the expected title, timestamp and status."""
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.archiver.download(item)

        self.assertValidResponseMetadata(
            post,
            "Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
            "yt-dlp_Twitter: success"
        )

    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
        """A tweet with a video is archived with the expected title and timestamp."""
        url = "https://x.com/bellingcat/status/1871552600346415571"
        post = self.archiver.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
        )

    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
    @pytest.mark.download
    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
        ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
    ])
    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
        """Download tweets with sensitive media"""
        # NOTE(review): every case uses the literal "image_hash" and the same
        # timestamp; these look like placeholders to fill in once the xfail is
        # lifted -- confirm before relying on them.
        post = self.archiver.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            title,
            timestamp
        )
        assert len(post.media) == 1
        assert post.media[0].hash == image_hash
|
||||
@@ -1,73 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
|
||||
class TestBlueskyArchiver(TestArchiverBase):
|
||||
"""Tests Bluesky Archiver
|
||||
|
||||
Note that these tests will download API responses from the bluesky API, so they may be slow.
|
||||
This is an intended feature, as we want to test to ensure the bluesky API format hasn't changed,
|
||||
and also test the archiver's ability to download media.
|
||||
"""
|
||||
|
||||
archiver_class = BlueskyArchiver
|
||||
config = {}
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_media_with_images(self):
|
||||
# url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
|
||||
|
||||
# just make sure bsky haven't changed their format, images should be under "record/embed/media/images"
|
||||
# there should be 2 images
|
||||
assert "record" in post
|
||||
assert "embed" in post["record"]
|
||||
assert "media" in post["record"]["embed"]
|
||||
assert "images" in post["record"]["embed"]["media"]
|
||||
assert len(post["record"]["embed"]["media"]["images"]) == 2
|
||||
|
||||
# try downloading the media files
|
||||
media = self.archiver._download_bsky_embeds(post)
|
||||
assert len(media) == 2
|
||||
|
||||
# check the IDs
|
||||
assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
|
||||
assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_post_with_single_image(self):
|
||||
# url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")
|
||||
|
||||
# just make sure bsky haven't changed their format, images should be under "record/embed/images"
|
||||
# there should be 1 image
|
||||
assert "record" in post
|
||||
assert "embed" in post["record"]
|
||||
assert "images" in post["record"]["embed"]
|
||||
assert len(post["record"]["embed"]["images"]) == 1
|
||||
|
||||
media = self.archiver._download_bsky_embeds(post)
|
||||
assert len(media) == 1
|
||||
|
||||
# check the ID
|
||||
assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_post_with_video(self):
|
||||
# url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
|
||||
# just make sure bsky haven't changed their format, video should be under "record/embed/video"
|
||||
assert "record" in post
|
||||
assert "embed" in post["record"]
|
||||
assert "video" in post["record"]["embed"]
|
||||
|
||||
media = self.archiver._download_bsky_embeds(post)
|
||||
assert len(media) == 1
|
||||
|
||||
# check the ID
|
||||
assert "bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src')
|
||||
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
from auto_archiver.archivers.tiktok_archiver import TiktokArchiver
|
||||
|
||||
class TestBlueskyArchiver(TestArchiverBase):
|
||||
|
||||
archiver_class = TiktokArchiver
|
||||
config = {}
|
||||
|
||||
@pytest.mark.xfail(reason="Tiktok API is not working")
|
||||
@pytest.mark.download
|
||||
def test_download_video(self, make_item):
|
||||
# cat video
|
||||
url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en"
|
||||
item = self.archiver.download(make_item(url))
|
||||
assert item.success
|
||||
@@ -1,19 +1,31 @@
|
||||
import os
|
||||
import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.archivers.twitter_archiver import TwitterArchiver
|
||||
|
||||
from pytwitter.models.media import MediaVariant
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
from auto_archiver.archivers import TwitterApiArchiver
|
||||
|
||||
class TestTwitterArchiver(TestArchiverBase):
|
||||
|
||||
archiver_class = TwitterArchiver
|
||||
config = {}
|
||||
@pytest.mark.incremental
|
||||
class TestTwitterApiArchiver(TestArchiverBase):
|
||||
|
||||
archiver_class = TwitterApiArchiver
|
||||
config = {
|
||||
"bearer_tokens": [],
|
||||
"bearer_token": os.environ.get("TWITTER_BEARER_TOKEN"),
|
||||
"consumer_key": os.environ.get("TWITTER_CONSUMER_KEY"),
|
||||
"consumer_secret": os.environ.get("TWITTER_CONSUMER_SECRET"),
|
||||
"access_token": os.environ.get("TWITTER_ACCESS_TOKEN"),
|
||||
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, expected", [
|
||||
("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"), # t.co URL
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839"), # strip tracking params
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
|
||||
("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
|
||||
("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
|
||||
])
|
||||
@@ -25,64 +37,25 @@ class TestTwitterArchiver(TestArchiverBase):
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://www.bellingcat.com/category/resources/", False, False)
|
||||
])
|
||||
|
||||
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
|
||||
|
||||
username, tweet_id = self.archiver.get_username_tweet_id(url)
|
||||
assert exptected_username == username
|
||||
assert exptected_tweetid == tweet_id
|
||||
|
||||
|
||||
def test_choose_variants(self):
|
||||
# taken from the response for url https://x.com/bellingcat/status/1871552600346415571
|
||||
variant_list = [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'},
|
||||
{'bitrate': 256000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'},
|
||||
{'bitrate': 832000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'},
|
||||
{'bitrate': 2176000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12'}
|
||||
variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
|
||||
MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
|
||||
]
|
||||
chosen_variant = self.archiver.choose_variant(variant_list)
|
||||
assert chosen_variant == variant_list[3]
|
||||
|
||||
@pytest.mark.parametrize("tweet_id, expected_token", [
|
||||
("1874097816571961839", "4jjngwkifa"),
|
||||
("1674700676612386816", "42586mwa3uv"),
|
||||
("1877747914073620506", "4jv4aahw36n"),
|
||||
("1876710769913450647", "4jruzjz5lux"),
|
||||
("1346554693649113090", "39ibqxei7mo")
|
||||
])
|
||||
def test_reverse_engineer_token(self, tweet_id, expected_token):
|
||||
# see Vercel's implementation here: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
|
||||
# and the discussion here: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
|
||||
|
||||
generated_token = self.archiver.generate_token(tweet_id)
|
||||
assert expected_token == generated_token
|
||||
|
||||
@pytest.mark.download
|
||||
def test_youtube_dlp_archiver(self, make_item):
|
||||
|
||||
url = "https://x.com/bellingcat/status/1874097816571961839"
|
||||
post = self.archiver.download_yt_dlp(make_item(url), url, "1874097816571961839")
|
||||
assert post
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"twitter-ytdl"
|
||||
)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_syndication_archiver(self, make_item):
|
||||
|
||||
url = "https://x.com/bellingcat/status/1874097816571961839"
|
||||
post = self.archiver.download_syndication(make_item(url), url, "1874097816571961839")
|
||||
assert post
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
|
||||
)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_nonexistend_tweet(self, make_item):
|
||||
def test_download_nonexistent_tweet(self, make_item):
|
||||
# this tweet does not exist
|
||||
url = "https://x.com/Bellingcat/status/17197025860711058"
|
||||
response = self.archiver.download(make_item(url))
|
||||
@@ -105,9 +78,9 @@ class TestTwitterArchiver(TestArchiverBase):
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"twitter-ytdl"
|
||||
"twitter-api: success"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_video(self, make_item):
|
||||
url = "https://x.com/bellingcat/status/1871552600346415571"
|
||||
@@ -118,14 +91,13 @@ class TestTwitterArchiver(TestArchiverBase):
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
|
||||
)
|
||||
|
||||
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
])
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
|
||||
|
||||
"""Download tweets with sensitive media"""
|
||||
@@ -1,71 +0,0 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from auto_archiver.archivers.youtubedl_archiver import YoutubeDLArchiver
|
||||
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
|
||||
class TestYoutubeDLArchiver(TestArchiverBase):
|
||||
"""Tests YoutubeDL Archiver
|
||||
"""
|
||||
archiver_class = YoutubeDLArchiver
|
||||
config = {
|
||||
'subtitles': False,
|
||||
'comments': False,
|
||||
'livestreams': False,
|
||||
'live_from_start': False,
|
||||
'end_means_success': True,
|
||||
'allow_playlist': False,
|
||||
'max_downloads': "inf",
|
||||
'proxy': None,
|
||||
'cookies_from_browser': False,
|
||||
'cookie_file': None,
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, is_suitable", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
|
||||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
|
||||
("https://google.com", True)])
|
||||
def test_suitable_urls(self, make_item, url, is_suitable):
|
||||
"""
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
and then if and only if all archivers fails, does it fall back to the generic archiver)
|
||||
"""
|
||||
assert self.archiver.suitable(make_item(url)) == is_suitable
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_tiktok(self, make_item):
|
||||
item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
|
||||
result = self.archiver.download(item)
|
||||
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_youtube(self, make_item):
|
||||
# url https://www.youtube.com/watch?v=5qap5aO4i9A
|
||||
item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
|
||||
result = self.archiver.download(item)
|
||||
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
|
||||
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
|
||||
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
assert len(result.media) == 2
|
||||
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
|
||||
assert Path(result.media[1].filename).name == "hqdefault.jpg"
|
||||
|
||||
@pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
|
||||
@pytest.mark.download
|
||||
def test_download_bluesky_with_images(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
|
||||
result = self.archiver.download(item)
|
||||
assert result is not False
|
||||
|
||||
@pytest.mark.skip("ytdlp supports twitter, but there's currently no way to extract info from pages without videos")
|
||||
@pytest.mark.download
|
||||
def test_download_twitter_textonly(self, make_item):
|
||||
item = make_item("https://x.com/bellingcat/status/1874097816571961839")
|
||||
result = self.archiver.download(item)
|
||||
assert result is not False
|
||||
@@ -1,6 +1,19 @@
|
||||
"""
|
||||
pytest conftest file, for shared fixtures and configuration
|
||||
"""
|
||||
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pytest
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
# Test names inserted into this list will be run last. This is useful for expensive/costly tests
|
||||
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important:
|
||||
# what comes first will be run first (at the end of all other tests not mentioned)
|
||||
# format is the name of the module (python file) without the .py extension
|
||||
TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def make_item():
|
||||
def _make_item(url: str, **kwargs) -> Metadata:
|
||||
@@ -9,4 +22,61 @@ def make_item():
|
||||
item.set(key, value)
|
||||
return item
|
||||
|
||||
return _make_item
|
||||
return _make_item
|
||||
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
    """Move the tests of every module named in ``TESTS_TO_RUN_LAST`` to the end.

    pytest hook; ``items`` is the list of collected test items and is
    reordered in place. Modules are processed in the order they appear in
    ``TESTS_TO_RUN_LAST``, so an earlier entry runs before a later one, and
    both run after every test from modules that are not listed. Tests keep
    their relative order within a module.

    Args:
        items: collected test items; each must expose ``item.module.__name__``.
    """
    # Pair every item with the last component of its module's dotted name
    # (i.e. the python file name without the .py extension).
    tagged = [(item.module.__name__.split(".")[-1], item) for item in items]

    for module in TESTS_TO_RUN_LAST:
        # Stable partition into "everything else" + "this module". Building
        # new lists (instead of remove()/append() on the list being iterated)
        # guarantees that no item is skipped: mutating a list while iterating
        # over it shifts the remaining elements under the iterator, which left
        # every second test of a deferred module in its original position.
        kept = [pair for pair in tagged if pair[0] != module]
        deferred = [pair for pair in tagged if pair[0] == module]
        tagged = kept + deferred

    # Slice assignment so that pytest sees the new order through its own list.
    items[:] = [item for _, item in tagged]
|
||||
|
||||
|
||||
|
||||
# Incremental testing - once a test of a class marked ``incremental`` has
# failed, the remaining tests of that class are not worth running.
# Adapted from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps

# Failure history: str(test class) -> {parametrize indices -> name of the first
# test that failed}. The inner key is the empty tuple when the failing test is
# not parametrized.
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}


def pytest_runtest_makereport(item, call):
    """Remember the first failing test of each ``incremental`` class.

    pytest hook. Nothing is recorded unless ``item`` carries the
    ``incremental`` keyword and ``call`` holds exception info.

    Args:
        item: the test item being reported on.
        call: the call info; ``call.excinfo`` is ``None`` when nothing was raised.
    """
    # Only failures of tests marked as incremental are of interest.
    if "incremental" not in item.keywords or call.excinfo is None:
        return

    # Parametrized items carry a ``callspec``; their indices form the inner key.
    if hasattr(item, "callspec"):
        index_key = tuple(item.callspec.indices.values())
    else:
        index_key = ()

    # Prefer the undecorated function name over the parametrized item name.
    failed_name = item.originalname or item.name

    # setdefault twice: create the per-class table on first use, and never
    # overwrite an earlier failure recorded under the same indices.
    per_class = _test_failed_incremental.setdefault(str(item.cls), {})
    per_class.setdefault(index_key, failed_name)
|
||||
|
||||
|
||||
def pytest_runtest_setup(item):
    """xfail a test of an ``incremental`` class that already has a recorded failure.

    pytest hook. Items without the ``incremental`` keyword, and items whose
    class has no entry in ``_test_failed_incremental``, are left alone.

    Args:
        item: the test item about to be set up.
    """
    if "incremental" not in item.keywords:
        return

    # Failures recorded so far for this test class, if any.
    class_failures = _test_failed_incremental.get(str(item.cls))
    if class_failures is None:
        return

    # Only the entry stored under the empty index tuple is consulted, i.e. the
    # first failure of a non-parametrized test.
    # NOTE(review): pytest_runtest_makereport stores failures of parametrized
    # tests under their own (non-empty) index tuple, so those failures never
    # trigger the xfail here -- confirm whether that is intended.
    first_failed = class_failures.get(())
    if first_failed is not None:
        pytest.xfail(f"previous test failed ({first_failed})")
|
||||
Reference in New Issue
Block a user