From f53e34d6bda85c3d62964c2fe4d9485747335d5f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 20:55:07 +0000 Subject: [PATCH 01/19] Bump webrecorder/browsertrix-crawler from 1.6.1 to 1.6.2 Bumps webrecorder/browsertrix-crawler from 1.6.1 to 1.6.2. --- updated-dependencies: - dependency-name: webrecorder/browsertrix-crawler dependency-version: 1.6.2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8100f95..f9d037d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.6.1 AS base +FROM webrecorder/browsertrix-crawler:1.6.2 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ From c815488daa9a9e44b4737eafdeb05c19bdc7c5bb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:44:52 +0100 Subject: [PATCH 02/19] adds new URLs to ignore --- src/auto_archiver/utils/url.py | 8 ++++++++ tests/utils/test_urls.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 368d93c..9d7730b 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str: def is_relevant_url(url: str) -> bool: """ Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc. + + Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8. """ clean_url = remove_get_parameters(url) @@ -104,11 +106,17 @@ def is_relevant_url(url: str) -> bool: ("vk.com/images/reaction/",), # wikipedia ("wikipedia.org/static",), + # reddit + ("styles.redditmedia.com",), # opinionated but excludes may irrelevant images like avatars and banners + ("emoji.redditmedia.com",), ] IRRELEVANT_ENDS_WITH = [ ".svg", # ignore SVGs ".ico", # ignore icons + ".m3u8", + ".mpd", + ".ism", # ignore index files for videos, these should be handled by ytdlp ] for end in IRRELEVANT_ENDS_WITH: diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index 7871847..2fb66a5 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -95,6 +95,11 @@ def test_remove_get_parameters(url, without_get): ("https://example.com/150x150.jpg", True), ("https://example.com/rsrc.php/", True), ("https://example.com/img/emoji/", True), + ("https://styles.redditmedia.com/123", False), + ("https://emoji.redditmedia.com/abc.jpg", False), + ("https://example.com/rsrc.m3u8?asdasd=10", False), + ("https://example.com/rsrc.mpd", False), + ("https://example.com/rsrc.ism?vid=12", False), ], ) def test_is_relevant_url(url, relevant): From 287e823f43ffeb5a85facaed9f6587279a4cf2a6 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:09:38 +0100 Subject: [PATCH 03/19] improves twitter URL cleaning and introduces another bestquality check --- src/auto_archiver/utils/url.py | 34 ++++++++++++++++++++++++-- tests/utils/test_urls.py | 44 +++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 9d7730b..ea03d7f 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -133,6 +133,36 @@ def is_relevant_url(url: str) -> bool: def twitter_best_quality_url(url: str) -> str: """ some twitter image URLs point to a less-than best quality - this returns the URL pointing to the highest (original) quality + this returns the URL pointing to the highest (original) quality (with 'name=orig') """ - return re.sub(r"name=(\w+)", "name=orig", url, 1) + parsed = urlparse(url) + query = parsed.query + if "name=" in query: + # Replace only the first occurrence of name=xxx with name=orig + new_query = re.sub(r"name=[^&]*", "name=orig", query, 1) + parsed = parsed._replace(query=new_query) + return urlunparse(parsed) + return url + + +def get_media_url_best_quality(url: str) -> str: + """ + Returns the best quality URL for the given media URL, it may not exist. + """ + parsed = urlparse(url) + + # twitter case + if any(d in parsed.netloc.replace("www", "") for d in ("twitter.com", "twimg.com", "x.com")): + url = twitter_best_quality_url(url) + parsed = urlparse(url) + + # some cases https://example.com/media-1280x720.mp4 to https://example.com/media.mp4 + basename = parsed.path.split("/")[-1] + match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", basename) + if match: + orig_basename = match.group(1) + match.group(2) + new_path = "/".join(parsed.path.split("/")[:-1] + [orig_basename]) + parsed = parsed._replace(path=new_path) # keep the query unchanged + url = urlunparse(parsed) + + return url diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index 2fb66a5..df8e0f3 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -6,6 +6,7 @@ from auto_archiver.utils.url import ( is_relevant_url, remove_get_parameters, twitter_best_quality_url, + get_media_url_best_quality, ) @@ -109,10 +110,51 @@ def test_is_relevant_url(url, relevant): @pytest.mark.parametrize( "url, best_quality", [ - ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"), + ( + "https://twitter.com/some_image.jpg?name=small&this_is_another=145", + "https://twitter.com/some_image.jpg?name=orig&this_is_another=145", + ), ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"), ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"), ], ) def test_twitter_best_quality_url(url, best_quality): assert twitter_best_quality_url(url) == best_quality + + +@pytest.mark.parametrize( + "input_url,expected_url", + [ + # Twitter: add/replace name= to name=orig + ( + "https://pbs.twimg.com/media/abc123?format=jpg&name=small", + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + ), + ("https://pbs.twimg.com/media/abc123?name=large", "https://pbs.twimg.com/media/abc123?name=orig"), + ("https://pbs.twimg.com/media/abc123?format=jpg", "https://pbs.twimg.com/media/abc123?format=jpg"), + # Twitter: already orig + ( + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + "https://pbs.twimg.com/media/abc123?format=jpg&name=orig", + ), + # X.com domain + ("https://x.com/media/abc123?name=medium", "https://x.com/media/abc123?name=orig"), + # twimg.com domain + ("https://twimg.com/media/abc123?name=thumb", "https://twimg.com/media/abc123?name=orig"), + # Non-twitter domain, no change + ("https://example.com/media/file.mp4", "https://example.com/media/file.mp4"), + # Remove -WxH from basename + ("https://example.com/media/file-1280x720.mp4", "https://example.com/media/file.mp4"), + ("https://example.com/media/file-1920x1080.jpg?foo=bar", "https://example.com/media/file.jpg?foo=bar"), + # Both twitter and -WxH + ("https://pbs.twimg.com/media/abc-1280x720.jpg?name=small", "https://pbs.twimg.com/media/abc.jpg?name=orig"), + # No match for -WxH, no change + ("https://example.com/media/file.mp4?foo=bar", "https://example.com/media/file.mp4?foo=bar"), + # Path with multiple directories + ("https://example.com/a/b/c/file-640x480.png", "https://example.com/a/b/c/file.png"), + # -WxH in directory, not basename (should not change) + ("https://example.com/media-1280x720/file.mp4", "https://example.com/media-1280x720/file.mp4"), + ], +) +def test_get_media_url_best_quality(input_url, expected_url): + assert get_media_url_best_quality(input_url) == expected_url From 6bd493a79179b1e3e259728594f9a807435b90af Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:11:55 +0100 Subject: [PATCH 04/19] dropin with new ytdlp feature and helper method --- .../antibot_extractor_enricher/dropin.py | 77 ++++++++++++++++++- .../antibot_extractor_enricher/dropins/vk.py | 61 ++------------- 2 files changed, 81 insertions(+), 57 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 805edfd..15c2e28 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -1,7 +1,10 @@ +import os +from loguru import logger from seleniumbase import SB +import yt_dlp -from auto_archiver.core.extractor import Extractor -from auto_archiver.core.metadata import Metadata +from auto_archiver.core import Extractor, Media, Metadata +from auto_archiver.utils.misc import ydl_entry_to_filename class Dropin: @@ -36,6 +39,20 @@ class Dropin: """ return url + @staticmethod + def images_selectors() -> str: + """ + CSS selector to find images in the HTML page + """ + return "img" + + @staticmethod + def video_selectors() -> str: + """ + CSS selector to find videos in the HTML page. + """ + return "video, source" + def open_page(self, url) -> bool: """ Make sure the page is opened, even if it requires authentication, captcha solving, etc. @@ -50,3 +67,59 @@ class Dropin: :return: A tuple (number of Images added, number of Videos added). """ raise NotImplementedError("This method should be implemented in the subclass") + + def _get_username_password(self, site) -> tuple[str, str]: + """ + Get the username and password for the site from the extractor's auth data. + :return: A tuple (username, password). + """ + auth = self.extractor.auth_for_site(site) + username = auth.get("username", "") + password = auth.get("password", "") + if not username or not password: + raise ValueError(f"{site} authentication requires a username and password.") + return username, password + + def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int: + """ + Download videos using yt-dlp. + :param video_urls: List of video URLs to download. + :return: The number of videos downloaded. + """ + if type(self.extractor.max_download_videos) is int: + video_urls = video_urls[: self.extractor.max_download_videos] + + if not video_urls: + return 0 + + ydl_options = [ + "-o", + os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), + "--quiet", + "--no-playlist", + "--no-write-subs", + "--no-write-auto-subs", + "--postprocessor-args", + "ffmpeg:-bitexact", + "--max-filesize", + "1000M", # Limit to 1GB per video + ] + *_, validated_options = yt_dlp.parse_options(ydl_options) + downloaded = 0 + with yt_dlp.YoutubeDL(validated_options) as ydl: + for url in video_urls: + try: + logger.debug(f"Downloading video from URL: {url}") + info = ydl.extract_info(url, download=True) + filename = ydl_entry_to_filename(ydl, info) + if not filename: # Failed to download video. + continue + media = Media(filename) + for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: + if x in info: + media.set(x, info[x]) + to_enrich.add_media(media) + downloaded += 1 + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + return downloaded diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 6f54187..76e176e 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -1,12 +1,8 @@ -import os import re -from auto_archiver.core.media import Media from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from auto_archiver.utils.misc import ydl_entry_to_filename -import yt_dlp from loguru import logger @@ -37,8 +33,8 @@ class VkDropin(Dropin): def open_page(self, url) -> bool: if self.sb.is_text_visible("Sign in to VK"): - self._login() - self.sb.open(url) + if self._login(): + self.sb.open(url) return True def _login(self) -> bool: @@ -50,13 +46,9 @@ class VkDropin(Dropin): return True # need to login - logger.debug("Logging in to VK...") - auth = self.extractor.auth_for_site("vk.com") - username = auth.get("username", "") - password = auth.get("password", "") - if not username or not password: - raise ValueError("VK authentication requires a username and password.") - logger.debug("Using username: {}", username) + username, password = self._get_username_password("vk.com") + logger.debug("Logging in to VK with username: {}", username) + self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10) @@ -80,47 +72,6 @@ class VkDropin(Dropin): @logger.catch def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: - """ - Extract video data from the currently open post with SeleniumBase. - - :return: A tuple (number of Images added, number of Videos added). - """ video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')] - if type(self.extractor.max_download_videos) is int: - video_urls = video_urls[: self.extractor.max_download_videos] - if not video_urls: - return 0, 0 - - logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.") - ydl_options = [ - "-o", - os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), - "--quiet", - "--no-playlist", - "--no-write-subs", - "--no-write-auto-subs", - "--postprocessor-args", - "ffmpeg:-bitexact", - "--max-filesize", - "1000M", # Limit to 1GB per video - ] - *_, validated_options = yt_dlp.parse_options(ydl_options) - downloaded = 0 - with yt_dlp.YoutubeDL(validated_options) as ydl: - for url in video_urls: - try: - logger.debug(f"Downloading video from URL: {url}") - info = ydl.extract_info(url, download=True) - filename = ydl_entry_to_filename(ydl, info) - if not filename: # Failed to download video. - continue - media = Media(filename) - for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: - if x in info: - media.set(x, info[x]) - to_enrich.add_media(media) - downloaded += 1 - except Exception as e: - logger.error(f"Error downloading {url}: {e}") - return 0, downloaded + return 0, self._download_videos_with_ytdlp(video_urls, to_enrich) From 6d82655cc40e2a817be73b1725ec2b31dd6ed872 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:14:34 +0100 Subject: [PATCH 05/19] manifest improvement for antibot --- .../modules/antibot_extractor_enricher/__manifest__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index d2e9d66..d93c072 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -31,7 +31,9 @@ }, }, "description": """ - Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile. + Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha. + + Still in trial development, please report any issues or suggestions via GitHub Issues. ### Features - Extracts the HTML source code of the page. @@ -40,7 +42,6 @@ - Downloads images and videos from the page, excluding specified file extensions. ### Notes - - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH. - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary. """, } From 809b8c7749168cf36f11406ee3eafd2c97a443ed Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:14:42 +0100 Subject: [PATCH 06/19] default dropin introduced --- .../dropins/default.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py new file mode 100644 index 0000000..c5c865a --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py @@ -0,0 +1,18 @@ +from auto_archiver.core.metadata import Metadata +from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + + +class DefaultDropin(Dropin): + """ + A default fallback drop-in class for handling generic cases in the antibot extractor enricher module. + """ + + @staticmethod + def suitable(url: str) -> bool: + return False + + def open_page(self, url) -> bool: + return True + + def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: + return 0, 0 From 6bbc7fb47a67a52d244083eb9159bda692bfe382 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:29:07 +0100 Subject: [PATCH 07/19] improves antibot flow and makes auth_wall detection optional --- .../__manifest__.py | 5 ++ .../antibot_extractor_enricher.py | 68 +++++++++++-------- .../test_antibot_extractor_enricher.py | 22 +++++- 3 files changed, 64 insertions(+), 31 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index d93c072..214653c 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -25,6 +25,11 @@ "default": "secrets/antibot_user_data", "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.", }, + "detect_auth_wall": { + "default": True, + "type": "bool", + "help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.", + }, "proxy": { "default": None, "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'", diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index e82a2f8..08d0c03 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -15,7 +15,9 @@ from seleniumbase import SB from auto_archiver.core import Extractor, Enricher, Metadata, Media from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin +from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin from auto_archiver.utils.misc import random_str +from auto_archiver.utils.url import is_relevant_url class AntibotExtractorEnricher(Extractor, Enricher): @@ -102,39 +104,41 @@ class AntibotExtractorEnricher(Extractor, Enricher): sb.uc_open_with_reconnect(url, 4) logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") + sb.uc_gui_handle_cf() + sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future - # TODO: implement other Captcha handling - sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected + dropin = self._get_suitable_dropin(url, sb) + dropin.open_page(url) - suitable_dropin = self._get_suitable_dropin(url, sb) - - if suitable_dropin: - suitable_dropin.open_page(url) - - if self._hit_auth_wall(sb): + if self.detect_auth_wall and self._hit_auth_wall(sb): logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") return False - logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...") + sb.wait_for_ready_state_complete() sb.sleep(1) # margin for the page to load completely to_enrich.set_title(sb.get_title()) self._enrich_html_source_code(sb, to_enrich) + self._enrich_full_page_screenshot(sb, to_enrich) if self.save_to_pdf: self._enrich_full_page_pdf(sb, to_enrich) - downloaded_images, downloaded_videos = 0, 0 - if suitable_dropin: - downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich) + downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich) self._enrich_download_media( - sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images + sb, + to_enrich, + css_selector=dropin.images_selectors(), + max_media=self.max_download_images - downloaded_images, ) self._enrich_download_media( - sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos + sb, + to_enrich, + css_selector=dropin.video_selectors(), + max_media=self.max_download_videos - downloaded_videos, ) - logger.success(f"ANTIBOT completed for {url_sample}") + logger.info(f"ANTIBOT completed for {url_sample}") return to_enrich except selenium.common.exceptions.SessionNotCreatedException as e: @@ -155,10 +159,10 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ for dropin in self.dropins: if dropin.suitable(url): - logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}") + logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}") return dropin(sb, self) - # logger.warning(f"ANTIBOT no suitable drop-in found for {url}") - return None + + return DefaultDropin(sb, self) def _hit_auth_wall(self, sb: SB) -> bool: """ @@ -168,8 +172,8 @@ class AntibotExtractorEnricher(Extractor, Enricher): # TODO: improve this detection logic, currently it is very basic and may not cover all cases # Common URL patterns - url = sb.get_current_url().lower() - if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]): + current_url = sb.get_current_url().lower() + if any(kw in current_url for kw in ["login", "signin", "signup", "register", "captcha"]): return True # Common visible text markers @@ -245,8 +249,12 @@ class AntibotExtractorEnricher(Extractor, Enricher): Enriches the full page screenshot of the Metadata object. This method is called by the enrich method. """ - x = sb.execute_script("return document.documentElement.scrollWidth") - y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000) + start_size = sb.get_window_size() + w, h = start_size["width"], start_size["height"] + + x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) + y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) + logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") sb.set_window_size(x, y) screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") @@ -278,12 +286,9 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ if max_media == 0: return - logger.debug( - f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items." - ) url = to_enrich.get_url() all_urls = set() - # media_elements = sb.find_elements(css_selector) + sources = sb.execute_script(f""" return Array.from(document.querySelectorAll("{css_selector}")) .map(el => el.src || el.href) @@ -293,10 +298,15 @@ class AntibotExtractorEnricher(Extractor, Enricher): if len(all_urls) >= max_media: logger.debug(f"Reached max download limit of {max_media} images/videos.") break - mimerype = mimetypes.guess_type(src)[0] - if mimerype in self.exclude_media_mimetypes: + if not is_relevant_url(src): + continue + mimetype = mimetypes.guess_type(src)[0] + if mimetype in self.exclude_media_mimetypes: continue full_src = urljoin(url, src) - if full_src not in all_urls and (filename := self.download_from_url(full_src)): + if full_src not in all_urls: + filename, full_src = self.download_from_url(full_src, try_best_quality=True) + if not filename: + continue all_urls.add(full_src) to_enrich.add_media(Media(filename=filename, properties={"url": full_src})) diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 1da025d..600baf2 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -129,15 +129,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase): ), ( "https://seleniumbase.io/apps/turnstile", - 'id="captcha-success"', + '', + ), + ( + "https://seleniumbase.io/apps/form_turnstile", + '', + ), + ( + "https://gitlab.com/users/sign_in", + "Password", ), ], ) - def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html): + def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html): """ Test downloading a page with Cloudflare Turnstile captcha. """ + self.extractor = setup_module( + self.extractor_module, + { + "save_to_pdf": True, + "detect_auth_wall": False, + "max_download_images": 5, + "max_download_videos": "inf", + }, + ) + item = make_item(url) self.extractor.enrich(item) From ef0e909a728e1adada532cb39a235718e45e6633 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:29:35 +0100 Subject: [PATCH 08/19] extractor to auto detect best quality --- src/auto_archiver/core/extractor.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index cf42f1e..ca3359d 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -8,6 +8,7 @@ Factory method to initialize an extractor instance based on its name. from __future__ import annotations from abc import abstractmethod +from contextlib import suppress import mimetypes import os import requests @@ -16,6 +17,7 @@ from retrying import retry import re from auto_archiver.core import Metadata, BaseModule +from auto_archiver.utils.url import get_media_url_best_quality class Extractor(BaseModule): @@ -70,10 +72,20 @@ class Extractor(BaseModule): return "" @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5) - def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str: + def download_from_url(self, url: str, to_filename: str = None, verbose=True, try_best_quality=False) -> str: """ downloads a URL to provided filename, or inferred from URL, returns local filename + Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful. """ + + if try_best_quality: + with suppress(Exception): + # Attempt to download the original URL + best_quality_url = get_media_url_best_quality(url) + orig_download = self.download_from_url(best_quality_url, to_filename, verbose) + if orig_download: + return orig_download, best_quality_url + if not to_filename: to_filename = url.split("/")[-1].split("?")[0] if len(to_filename) > 64: @@ -98,10 +110,12 @@ class Extractor(BaseModule): with open(to_filename, "wb") as f: for chunk in d.iter_content(chunk_size=8192): f.write(chunk) + if try_best_quality: + return to_filename, url return to_filename except requests.RequestException as e: - logger.warning(f"Failed to fetch the Media URL: {e}") + logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") @abstractmethod def download(self, item: Metadata) -> Metadata | False: From 773fa82f0687e296cc94db78703fabecb56aba76 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:31:19 +0100 Subject: [PATCH 09/19] introduces reddit dropin --- .../dropins/reddit.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py new file mode 100644 index 0000000..c46ca17 --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -0,0 +1,77 @@ +from contextlib import suppress +from auto_archiver.core.metadata import Metadata +from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + +from loguru import logger + + +class RedditDropin(Dropin): + """ + A class to handle Reddit drop-in functionality for the antibot extractor enricher module. + """ + + @staticmethod + def suitable(url: str) -> bool: + return "reddit.com" in url + + @staticmethod + def images_selectors() -> str: + return "shreddit-post img" + + @staticmethod + def video_selectors() -> str: + return "shreddit-post video, shreddit-post source" + + def open_page(self, url) -> bool: + if self.sb.is_text_visible("You've been blocked by network security."): + self._login() + if url != self.sb.get_current_url(): + self.sb.open(url) + return True + + def _login(self): + self.sb.click_link_text("Log in") + self.sb.wait_for_ready_state_complete() + self._close_cookies_banner() + + username, password = self._get_username_password("reddit.com") + logger.debug("RedditDropin Logging in to VK with username: {}", username) + + self.sb.type("#login-username", username) + self.sb.type("#login-password", password) + + elem = self.sb.find_element("button.login") + self.sb.execute_script("arguments[0].scrollIntoView(true);", elem) + self.sb.slow_click("button.login") + self.sb.wait_for_ready_state_complete() + + if "https://www.reddit.com/login/" in self.sb.get_current_url(): + self.sb.sleep(5) + self.sb.wait_for_ready_state_complete() + + if self.sb.is_text_visible("You've been blocked by network security."): + self.sb.click_link_text("Log in") + self.sb.wait_for_ready_state_complete() + if self.sb.is_text_visible("Welcome back"): + logger.debug("RedditDropin Login successful") + self.sb.click_if_visible("this link") + + def _close_cookies_banner(self): + with suppress(Exception): # selenium.common.exceptions.JavascriptException + self.sb.execute_script(""" + document + .querySelector("reddit-cookie-banner") + .shadowRoot.querySelector("faceplate-dialog") + .querySelector("#accept-all-cookies-button button") + .click() + """) + + @logger.catch + def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: + filtered_urls = self.sb.execute_script(f""" + return [...document.querySelectorAll("{self.video_selectors()}")] + .map(el => el.src || el.href) + .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); + """) + logger.debug("RedditDropin Found {} video URLs", len(filtered_urls)) + return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) From ca00aa302d291c3eff5b33951ec701f36d0a8ab9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:31:32 +0100 Subject: [PATCH 10/19] version bump breaking --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 466d090..cdbb86b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.0.1" +version = "1.1.0" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" From 4270e067286ea2d52ae3f0663c931af69963bbad Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:33:47 +0100 Subject: [PATCH 11/19] npm update on scripts/settings --- scripts/settings/package-lock.json | 293 +++++++++++++++-------------- 1 file changed, 150 insertions(+), 143 deletions(-) diff --git a/scripts/settings/package-lock.json b/scripts/settings/package-lock.json index a60d74e..4aced9a 100644 --- a/scripts/settings/package-lock.json +++ b/scripts/settings/package-lock.json @@ -10,21 +10,21 @@ "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", - "@emotion/react": "*", - "@emotion/styled": "*", + "@emotion/react": "latest", + "@emotion/styled": "latest", "@mui/icons-material": "^7.1.1", - "@mui/material": "*", + "@mui/material": "latest", "react": "19.1.0", "react-dom": "19.1.0", "react-markdown": "^10.0.0", "yaml": "^2.7.0" }, "devDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "@vitejs/plugin-react": "*", - "typescript": "*", - "vite": "*", + "@types/react": "latest", + "@types/react-dom": "latest", + "@vitejs/plugin-react": "latest", + "typescript": "latest", + "vite": "latest", "vite-plugin-singlefile": "^2.1.0" } }, @@ -57,9 +57,9 @@ } }, "node_modules/@babel/compat-data": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.3.tgz", - "integrity": "sha512-V42wFfx1ymFte+ecf6iXghnnP8kWTO+ZLXIyZq+1LAXHHvTZdVxicn4yiVYdYMGaCO3tmqub11AorKkv+iodqw==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.27.5.tgz", + "integrity": "sha512-KiRAp/VoJaWkkte84TvUd9qjdbZAdiqyvMxrGl1N6vzFogKmaLgoM3L1kgtLicp2HP5fBJS8JrZKLVIZGVJAVg==", "dev": true, "license": "MIT", "engines": { @@ -105,12 +105,12 @@ "license": "MIT" }, "node_modules/@babel/generator": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.3.tgz", - "integrity": "sha512-xnlJYj5zepml8NXtjkG0WquFUv8RskFqyFcVgTBp5k+NaA/8uw/K+OSVf8AMGw5e9HKP2ETd5xpK5MLZQD6b4Q==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.27.5.tgz", + "integrity": "sha512-ZGhA37l0e/g2s1Cnzdix0O3aLYm66eF8aufiVteOgnwxgnRP8GoyMj7VWsgWnQbVKXyge7hqrFh2K2TQM6t1Hw==", "license": "MIT", "dependencies": { - "@babel/parser": "^7.27.3", + "@babel/parser": "^7.27.5", "@babel/types": "^7.27.3", "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.25", @@ -207,23 +207,23 @@ } }, "node_modules/@babel/helpers": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.4.tgz", - "integrity": "sha512-Y+bO6U+I7ZKaM5G5rDUZiYfUvQPUibYmAFe7EnKdnKBbVXDZxvp+MWOH5gYciY0EPk4EScsuFMQBbEfpdRKSCQ==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.6.tgz", + "integrity": "sha512-muE8Tt8M22638HU31A3CgfSUciwz1fhATfoVai05aPXGor//CdWDCbnlY1yvBPo07njuVOCNGCSp/GTt12lIug==", "dev": true, "license": "MIT", "dependencies": { "@babel/template": "^7.27.2", - "@babel/types": "^7.27.3" + "@babel/types": "^7.27.6" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/parser": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.4.tgz", - "integrity": "sha512-BRmLHGwpUqLFR2jzx9orBuX/ABDkj2jLKOXrHDTN2aOKL+jFDDKaRNo9nyYsIl9h/UE/7lMKdDjKQQyxKKDZ7g==", + "version": "7.27.5", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.5.tgz", + "integrity": "sha512-OsQd175SxWkGlzbny8J3K8TnnDD0N3lrIUtB92xwyRpzaenGZhxDvxN/JgU00U3CDZNj9tPuDJ5H0WS4Nt3vKg==", "license": "MIT", "dependencies": { "@babel/types": "^7.27.3" @@ -268,9 +268,9 @@ } }, "node_modules/@babel/runtime": { - "version": "7.27.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.4.tgz", - "integrity": "sha512-t3yaEOuGu9NlIZ+hIeGbBjFtZT7j2cb2tg0fuaJKeGotchRjjLfrBA9Kwf8quhpP1EUuxModQg04q/mBwyg8uA==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz", + "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==", "license": "MIT", "engines": { "node": ">=6.9.0" @@ -309,9 +309,9 @@ } }, "node_modules/@babel/types": { - "version": "7.27.3", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.3.tgz", - "integrity": "sha512-Y1GkI4ktrtvmawoSq+4FCVHNryea6uR+qUQy0AGxLSsjCX0nVmkYQMBLHDkXZuo5hGx7eYdnIaslsdBFm7zbUw==", + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.6.tgz", + "integrity": "sha512-ETyHEk2VHHvl9b9jZP5IHPavHYk57EhanlRRuae9XCpb/j5bDCbPPMOBfCWhnl/7EDJz0jEMCi/RhccCE8r1+Q==", "license": "MIT", "dependencies": { "@babel/helper-string-parser": "^7.27.1", @@ -1237,16 +1237,16 @@ } }, "node_modules/@rolldown/pluginutils": { - "version": "1.0.0-beta.9", - "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.9.tgz", - "integrity": "sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==", + "version": "1.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.11.tgz", + "integrity": "sha512-L/gAA/hyCSuzTF1ftlzUSI/IKr2POHsv1Dd78GfqkR83KMNuswWD61JxGV2L7nRwBBBSDr6R1gCkdTmoN7W4ag==", "dev": true, "license": "MIT" }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.41.1.tgz", - "integrity": "sha512-NELNvyEWZ6R9QMkiytB4/L4zSEaBC03KIXEghptLGLZWJ6VPrL63ooZQCOnlx36aQPGhzuOMwDerC1Eb2VmrLw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.42.0.tgz", + "integrity": "sha512-gldmAyS9hpj+H6LpRNlcjQWbuKUtb94lodB9uCz71Jm+7BxK1VIOo7y62tZZwxhA7j1ylv/yQz080L5WkS+LoQ==", "cpu": [ "arm" ], @@ -1258,9 +1258,9 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.41.1.tgz", - "integrity": "sha512-DXdQe1BJ6TK47ukAoZLehRHhfKnKg9BjnQYUu9gzhI8Mwa1d2fzxA1aw2JixHVl403bwp1+/o/NhhHtxWJBgEA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.42.0.tgz", + "integrity": "sha512-bpRipfTgmGFdCZDFLRvIkSNO1/3RGS74aWkJJTFJBH7h3MRV4UijkaEUeOMbi9wxtxYmtAbVcnMtHTPBhLEkaw==", "cpu": [ "arm64" ], @@ -1272,9 +1272,9 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.41.1.tgz", - "integrity": "sha512-5afxvwszzdulsU2w8JKWwY8/sJOLPzf0e1bFuvcW5h9zsEg+RQAojdW0ux2zyYAz7R8HvvzKCjLNJhVq965U7w==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.42.0.tgz", + "integrity": "sha512-JxHtA081izPBVCHLKnl6GEA0w3920mlJPLh89NojpU2GsBSB6ypu4erFg/Wx1qbpUbepn0jY4dVWMGZM8gplgA==", "cpu": [ "arm64" ], @@ -1286,9 +1286,9 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.41.1.tgz", - "integrity": "sha512-egpJACny8QOdHNNMZKf8xY0Is6gIMz+tuqXlusxquWu3F833DcMwmGM7WlvCO9sB3OsPjdC4U0wHw5FabzCGZg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.42.0.tgz", + "integrity": "sha512-rv5UZaWVIJTDMyQ3dCEK+m0SAn6G7H3PRc2AZmExvbDvtaDc+qXkei0knQWcI3+c9tEs7iL/4I4pTQoPbNL2SA==", "cpu": [ "x64" ], @@ -1300,9 +1300,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.41.1.tgz", - "integrity": "sha512-DBVMZH5vbjgRk3r0OzgjS38z+atlupJ7xfKIDJdZZL6sM6wjfDNo64aowcLPKIx7LMQi8vybB56uh1Ftck/Atg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.42.0.tgz", + "integrity": "sha512-fJcN4uSGPWdpVmvLuMtALUFwCHgb2XiQjuECkHT3lWLZhSQ3MBQ9pq+WoWeJq2PrNxr9rPM1Qx+IjyGj8/c6zQ==", "cpu": [ "arm64" ], @@ -1314,9 +1314,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.41.1.tgz", - "integrity": "sha512-3FkydeohozEskBxNWEIbPfOE0aqQgB6ttTkJ159uWOFn42VLyfAiyD9UK5mhu+ItWzft60DycIN1Xdgiy8o/SA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.42.0.tgz", + "integrity": "sha512-CziHfyzpp8hJpCVE/ZdTizw58gr+m7Y2Xq5VOuCSrZR++th2xWAz4Nqk52MoIIrV3JHtVBhbBsJcAxs6NammOQ==", "cpu": [ "x64" ], @@ -1328,9 +1328,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.41.1.tgz", - "integrity": "sha512-wC53ZNDgt0pqx5xCAgNunkTzFE8GTgdZ9EwYGVcg+jEjJdZGtq9xPjDnFgfFozQI/Xm1mh+D9YlYtl+ueswNEg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.42.0.tgz", + "integrity": "sha512-UsQD5fyLWm2Fe5CDM7VPYAo+UC7+2Px4Y+N3AcPh/LdZu23YcuGPegQly++XEVaC8XUTFVPscl5y5Cl1twEI4A==", "cpu": [ "arm" ], @@ -1342,9 +1342,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.41.1.tgz", - "integrity": "sha512-jwKCca1gbZkZLhLRtsrka5N8sFAaxrGz/7wRJ8Wwvq3jug7toO21vWlViihG85ei7uJTpzbXZRcORotE+xyrLA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.42.0.tgz", + "integrity": "sha512-/i8NIrlgc/+4n1lnoWl1zgH7Uo0XK5xK3EDqVTf38KvyYgCU/Rm04+o1VvvzJZnVS5/cWSd07owkzcVasgfIkQ==", "cpu": [ "arm" ], @@ -1356,9 +1356,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.41.1.tgz", - "integrity": "sha512-g0UBcNknsmmNQ8V2d/zD2P7WWfJKU0F1nu0k5pW4rvdb+BIqMm8ToluW/eeRmxCared5dD76lS04uL4UaNgpNA==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.42.0.tgz", + "integrity": "sha512-eoujJFOvoIBjZEi9hJnXAbWg+Vo1Ov8n/0IKZZcPZ7JhBzxh2A+2NFyeMZIRkY9iwBvSjloKgcvnjTbGKHE44Q==", "cpu": [ "arm64" ], @@ -1370,9 +1370,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.41.1.tgz", - "integrity": "sha512-XZpeGB5TKEZWzIrj7sXr+BEaSgo/ma/kCgrZgL0oo5qdB1JlTzIYQKel/RmhT6vMAvOdM2teYlAaOGJpJ9lahg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.42.0.tgz", + "integrity": "sha512-/3NrcOWFSR7RQUQIuZQChLND36aTU9IYE4j+TB40VU78S+RA0IiqHR30oSh6P1S9f9/wVOenHQnacs/Byb824g==", "cpu": [ "arm64" ], @@ -1384,9 +1384,9 @@ ] }, "node_modules/@rollup/rollup-linux-loongarch64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.41.1.tgz", - "integrity": "sha512-bkCfDJ4qzWfFRCNt5RVV4DOw6KEgFTUZi2r2RuYhGWC8WhCA8lCAJhDeAmrM/fdiAH54m0mA0Vk2FGRPyzI+tw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.42.0.tgz", + "integrity": "sha512-O8AplvIeavK5ABmZlKBq9/STdZlnQo7Sle0LLhVA7QT+CiGpNVe197/t8Aph9bhJqbDVGCHpY2i7QyfEDDStDg==", "cpu": [ "loong64" ], @@ -1398,9 +1398,9 @@ ] }, "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.41.1.tgz", - "integrity": "sha512-3mr3Xm+gvMX+/8EKogIZSIEF0WUu0HL9di+YWlJpO8CQBnoLAEL/roTCxuLncEdgcfJcvA4UMOf+2dnjl4Ut1A==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.42.0.tgz", + "integrity": "sha512-6Qb66tbKVN7VyQrekhEzbHRxXXFFD8QKiFAwX5v9Xt6FiJ3BnCVBuyBxa2fkFGqxOCSGGYNejxd8ht+q5SnmtA==", "cpu": [ "ppc64" ], @@ -1412,9 +1412,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.41.1.tgz", - "integrity": "sha512-3rwCIh6MQ1LGrvKJitQjZFuQnT2wxfU+ivhNBzmxXTXPllewOF7JR1s2vMX/tWtUYFgphygxjqMl76q4aMotGw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.42.0.tgz", + "integrity": "sha512-KQETDSEBamQFvg/d8jajtRwLNBlGc3aKpaGiP/LvEbnmVUKlFta1vqJqTrvPtsYsfbE/DLg5CC9zyXRX3fnBiA==", "cpu": [ "riscv64" ], @@ -1426,9 +1426,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.41.1.tgz", - "integrity": "sha512-LdIUOb3gvfmpkgFZuccNa2uYiqtgZAz3PTzjuM5bH3nvuy9ty6RGc/Q0+HDFrHrizJGVpjnTZ1yS5TNNjFlklw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.42.0.tgz", + "integrity": "sha512-qMvnyjcU37sCo/tuC+JqeDKSuukGAd+pVlRl/oyDbkvPJ3awk6G6ua7tyum02O3lI+fio+eM5wsVd66X0jQtxw==", "cpu": [ "riscv64" ], @@ -1440,9 +1440,9 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.41.1.tgz", - "integrity": "sha512-oIE6M8WC9ma6xYqjvPhzZYk6NbobIURvP/lEbh7FWplcMO6gn7MM2yHKA1eC/GvYwzNKK/1LYgqzdkZ8YFxR8g==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.42.0.tgz", + "integrity": "sha512-I2Y1ZUgTgU2RLddUHXTIgyrdOwljjkmcZ/VilvaEumtS3Fkuhbw4p4hgHc39Ypwvo2o7sBFNl2MquNvGCa55Iw==", "cpu": [ "s390x" ], @@ -1454,9 +1454,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.41.1.tgz", - "integrity": "sha512-cWBOvayNvA+SyeQMp79BHPK8ws6sHSsYnK5zDcsC3Hsxr1dgTABKjMnMslPq1DvZIp6uO7kIWhiGwaTdR4Og9A==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.42.0.tgz", + "integrity": "sha512-Gfm6cV6mj3hCUY8TqWa63DB8Mx3NADoFwiJrMpoZ1uESbK8FQV3LXkhfry+8bOniq9pqY1OdsjFWNsSbfjPugw==", "cpu": [ "x64" ], @@ -1468,9 +1468,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.41.1.tgz", - "integrity": "sha512-y5CbN44M+pUCdGDlZFzGGBSKCA4A/J2ZH4edTYSSxFg7ce1Xt3GtydbVKWLlzL+INfFIZAEg1ZV6hh9+QQf9YQ==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.42.0.tgz", + "integrity": "sha512-g86PF8YZ9GRqkdi0VoGlcDUb4rYtQKyTD1IVtxxN4Hpe7YqLBShA7oHMKU6oKTCi3uxwW4VkIGnOaH/El8de3w==", "cpu": [ "x64" ], @@ -1482,9 +1482,9 @@ ] }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.41.1.tgz", - "integrity": "sha512-lZkCxIrjlJlMt1dLO/FbpZbzt6J/A8p4DnqzSa4PWqPEUUUnzXLeki/iyPLfV0BmHItlYgHUqJe+3KiyydmiNQ==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.42.0.tgz", + "integrity": "sha512-+axkdyDGSp6hjyzQ5m1pgcvQScfHnMCcsXkx8pTgy/6qBmWVhtRVlgxjWwDp67wEXXUr0x+vD6tp5W4x6V7u1A==", "cpu": [ "arm64" ], @@ -1496,9 +1496,9 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.41.1.tgz", - "integrity": "sha512-+psFT9+pIh2iuGsxFYYa/LhS5MFKmuivRsx9iPJWNSGbh2XVEjk90fmpUEjCnILPEPJnikAU6SFDiEUyOv90Pg==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.42.0.tgz", + "integrity": "sha512-F+5J9pelstXKwRSDq92J0TEBXn2nfUrQGg+HK1+Tk7VOL09e0gBqUHugZv7SW4MGrYj41oNCUe3IKCDGVlis2g==", "cpu": [ "ia32" ], @@ -1510,9 +1510,9 @@ ] }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.41.1.tgz", - "integrity": "sha512-Wq2zpapRYLfi4aKxf2Xff0tN+7slj2d4R87WEzqw7ZLsVvO5zwYCIuEGSZYiK41+GlwUo1HiR+GdkLEJnCKTCw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.42.0.tgz", + "integrity": "sha512-LpHiJRwkaVz/LqjHjK8LCi8osq7elmpwujwbXKNW88bM8eeGxavJIKKjkjpMHAh/2xfnrt1ZSnhTv41WYUHYmA==", "cpu": [ "x64" ], @@ -1578,9 +1578,9 @@ } }, "node_modules/@types/estree": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", - "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", "license": "MIT" }, "node_modules/@types/estree-jsx": { @@ -1623,24 +1623,24 @@ "license": "MIT" }, "node_modules/@types/prop-types": { - "version": "15.7.14", - "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz", - "integrity": "sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==", + "version": "15.7.15", + "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz", + "integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==", "license": "MIT" }, "node_modules/@types/react": { - "version": "19.1.6", - "resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.6.tgz", - "integrity": "sha512-JeG0rEWak0N6Itr6QUx+X60uQmN+5t3j9r/OVDtWzFXKaj6kD1BwJzOksD0FF6iWxZlbE1kB0q9vtnU2ekqa1Q==", + "version": "19.1.7", + "resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.7.tgz", + "integrity": "sha512-BnsPLV43ddr05N71gaGzyZ5hzkCmGwhMvYc8zmvI8Ci1bRkkDSzDDVfAXfN2tk748OwI7ediiPX6PfT9p0QGVg==", "license": "MIT", "dependencies": { "csstype": "^3.0.2" } }, "node_modules/@types/react-dom": { - "version": "19.1.5", - "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz", - "integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==", + "version": "19.1.6", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.6.tgz", + "integrity": "sha512-4hOiT/dwO8Ko0gV1m/TJZYk3y0KBnY9vzDh7W+DH17b2HFSOGgdj33dhihPeuy3l0q23+4e+hoXHV6hCC4dCXw==", "dev": true, "license": "MIT", "peerDependencies": { @@ -1669,16 +1669,16 @@ "license": "ISC" }, "node_modules/@vitejs/plugin-react": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.0.tgz", - "integrity": "sha512-JuLWaEqypaJmOJPLWwO335Ig6jSgC1FTONCWAxnqcQthLTK/Yc9aH6hr9z/87xciejbQcnP3GnA1FWUSWeXaeg==", + "version": "4.5.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.5.2.tgz", + "integrity": "sha512-QNVT3/Lxx99nMQWJWF7K4N6apUEuT0KlZA3mx/mVaoGj3smm/8rc8ezz15J1pcbcjDK0V15rpHetVfya08r76Q==", "dev": true, "license": "MIT", "dependencies": { - "@babel/core": "^7.26.10", - "@babel/plugin-transform-react-jsx-self": "^7.25.9", - "@babel/plugin-transform-react-jsx-source": "^7.25.9", - "@rolldown/pluginutils": "1.0.0-beta.9", + "@babel/core": "^7.27.4", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.11", "@types/babel__core": "^7.20.5", "react-refresh": "^0.17.0" }, @@ -1686,7 +1686,7 @@ "node": "^14.18.0 || >=16.0.0" }, "peerDependencies": { - "vite": "^4.2.0 || ^5.0.0 || ^6.0.0" + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0" } }, "node_modules/babel-plugin-macros": { @@ -1770,9 +1770,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001720", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001720.tgz", - "integrity": "sha512-Ec/2yV2nNPwb4DnTANEV99ZWwm3ZWfdlfkQbWSDDt+PsXEVYwlhPH8tdMaPunYTKKmz7AnHi2oNEi1GcmKCD8g==", + "version": "1.0.30001721", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001721.tgz", + "integrity": "sha512-cOuvmUVtKrtEaoKiO0rSc29jcjwMwX5tOHDy4MgVFEWiUXj4uBMJkwI8MDySkgXidpMiHUcviogAvFi4pA2hDQ==", "dev": true, "funding": [ { @@ -1959,9 +1959,9 @@ } }, "node_modules/electron-to-chromium": { - "version": "1.5.161", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.161.tgz", - "integrity": "sha512-hwtetwfKNZo/UlwHIVBlKZVdy7o8bIZxxKs0Mv/ROPiQQQmDgdm5a+KvKtBsxM8ZjFzTaCeLoodZ8jiBE3o9rA==", + "version": "1.5.166", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.166.tgz", + "integrity": "sha512-QPWqHL0BglzPYyJJ1zSSmwFFL6MFXhbACOCcsCdUMCkzPdS9/OIBVxg516X/Ado2qwAq8k0nJJ7phQPCqiaFAw==", "dev": true, "license": "ISC" }, @@ -2054,9 +2054,9 @@ "license": "MIT" }, "node_modules/fdir": { - "version": "6.4.5", - "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.5.tgz", - "integrity": "sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==", + "version": "6.4.6", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.6.tgz", + "integrity": "sha512-hiFoqpyZcfNm1yc4u8oWCf9A2c4D3QjCrks3zmoVKVxpQRzmPNar1hUJcBG2RQHvEVGDN+Jm81ZheVLAQMK6+w==", "dev": true, "license": "MIT", "peerDependencies": { @@ -3342,9 +3342,9 @@ } }, "node_modules/rollup": { - "version": "4.41.1", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.41.1.tgz", - "integrity": "sha512-cPmwD3FnFv8rKMBc1MxWCwVQFxwf1JEmSX3iQXrRVVG15zerAIXRjMFVWnd5Q5QvgKF7Aj+5ykXFhUl+QGnyOw==", + "version": "4.42.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.42.0.tgz", + "integrity": "sha512-LW+Vse3BJPyGJGAJt1j8pWDKPd73QM8cRXYK1IxOBgL2AGLu7Xd2YOW0M2sLUBCkF5MshXXtMApyEAEzMVMsnw==", "dev": true, "license": "MIT", "dependencies": { @@ -3358,29 +3358,36 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.41.1", - "@rollup/rollup-android-arm64": "4.41.1", - "@rollup/rollup-darwin-arm64": "4.41.1", - "@rollup/rollup-darwin-x64": "4.41.1", - "@rollup/rollup-freebsd-arm64": "4.41.1", - "@rollup/rollup-freebsd-x64": "4.41.1", - "@rollup/rollup-linux-arm-gnueabihf": "4.41.1", - "@rollup/rollup-linux-arm-musleabihf": "4.41.1", - "@rollup/rollup-linux-arm64-gnu": "4.41.1", - "@rollup/rollup-linux-arm64-musl": "4.41.1", - "@rollup/rollup-linux-loongarch64-gnu": "4.41.1", - "@rollup/rollup-linux-powerpc64le-gnu": "4.41.1", - "@rollup/rollup-linux-riscv64-gnu": "4.41.1", - "@rollup/rollup-linux-riscv64-musl": "4.41.1", - "@rollup/rollup-linux-s390x-gnu": "4.41.1", - "@rollup/rollup-linux-x64-gnu": "4.41.1", - "@rollup/rollup-linux-x64-musl": "4.41.1", - "@rollup/rollup-win32-arm64-msvc": "4.41.1", - "@rollup/rollup-win32-ia32-msvc": "4.41.1", - "@rollup/rollup-win32-x64-msvc": "4.41.1", + "@rollup/rollup-android-arm-eabi": "4.42.0", + "@rollup/rollup-android-arm64": "4.42.0", + "@rollup/rollup-darwin-arm64": "4.42.0", + "@rollup/rollup-darwin-x64": "4.42.0", + "@rollup/rollup-freebsd-arm64": "4.42.0", + "@rollup/rollup-freebsd-x64": "4.42.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.42.0", + "@rollup/rollup-linux-arm-musleabihf": "4.42.0", + "@rollup/rollup-linux-arm64-gnu": "4.42.0", + "@rollup/rollup-linux-arm64-musl": "4.42.0", + "@rollup/rollup-linux-loongarch64-gnu": "4.42.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.42.0", + "@rollup/rollup-linux-riscv64-gnu": "4.42.0", + "@rollup/rollup-linux-riscv64-musl": "4.42.0", + "@rollup/rollup-linux-s390x-gnu": "4.42.0", + "@rollup/rollup-linux-x64-gnu": "4.42.0", + "@rollup/rollup-linux-x64-musl": "4.42.0", + "@rollup/rollup-win32-arm64-msvc": "4.42.0", + "@rollup/rollup-win32-ia32-msvc": "4.42.0", + "@rollup/rollup-win32-x64-msvc": "4.42.0", "fsevents": "~2.3.2" } }, + "node_modules/rollup/node_modules/@types/estree": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", + "integrity": "sha512-w28IoSUCJpidD/TGviZwwMJckNESJZXFu7NBZ5YJ4mEUnNraUn9Pm8HSZm/jDF1pDWYKspWE7oVphigUPRakIQ==", + "dev": true, + "license": "MIT" + }, "node_modules/scheduler": { "version": "0.26.0", "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz", From 499c272260844bacf6eeb19dd739c74701b6f718 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:37:52 +0100 Subject: [PATCH 12/19] dependabot switch to monthly --- .github/dependabot.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 34e7a24..0042295 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,7 +12,7 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "github-actions" directory: "/" @@ -21,7 +21,7 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "npm" directory: "/scripts/settings/" @@ -30,11 +30,11 @@ updates: patterns: - "*" schedule: - interval: "weekly" + interval: "monthly" - package-ecosystem: "docker" # Look for a `Dockerfile` in the `root` directory directory: "/" # Check for updates once a week schedule: - interval: "weekly" \ No newline at end of file + interval: "monthly" \ No newline at end of file From 22bd8727dfa950e5541ab5df14bd87ead439688b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:43:55 +0100 Subject: [PATCH 13/19] python dependencies bump --- poetry.lock | 109 ++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8adf6b9..786ae5f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -193,18 +193,18 @@ files = [ [[package]] name = "boto3" -version = "1.38.27" +version = "1.38.33" description = "The AWS SDK for Python" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "boto3-1.38.27-py3-none-any.whl", hash = "sha256:95f5fe688795303a8a15e8b7e7f255cadab35eae459d00cc281a4fd77252ea80"}, - {file = "boto3-1.38.27.tar.gz", hash = "sha256:94bd7fdd92d5701b362d4df100d21e28f8307a67ff56b6a8b0398119cf22f859"}, + {file = "boto3-1.38.33-py3-none-any.whl", hash = "sha256:25d0717489c658f7ae6c3c7f0f7e1b4d611b30b2f08f0fcef6455ac6864a8768"}, + {file = "boto3-1.38.33.tar.gz", hash = "sha256:6467909c1ae01ff67981f021bb2568592211765ec8a9a1d2c4529191e46c3541"}, ] [package.dependencies] -botocore = ">=1.38.27,<1.39.0" +botocore = ">=1.38.33,<1.39.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.13.0,<0.14.0" @@ -213,14 +213,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.38.27" +version = "1.38.33" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "botocore-1.38.27-py3-none-any.whl", hash = "sha256:a785d5e9a5eda88ad6ab9ed8b87d1f2ac409d0226bba6ff801c55359e94d91a8"}, - {file = "botocore-1.38.27.tar.gz", hash = "sha256:9788f7efe974328a38cbade64cc0b1e67d27944b899f88cb786ae362973133b6"}, + {file = "botocore-1.38.33-py3-none-any.whl", hash = "sha256:ad25233e93dcebe95809b1f9393c1f11a639696327793d166295fb78dd7bc597"}, + {file = "botocore-1.38.33.tar.gz", hash = "sha256:dbe8fea9d0426c644c89ef2018ead886ccbcc22901a02b377b4e65ce1cb69a2b"}, ] [package.dependencies] @@ -941,14 +941,14 @@ files = [ [[package]] name = "google-api-core" -version = "2.24.2" +version = "2.25.0" description = "Google API client core library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_core-2.24.2-py3-none-any.whl", hash = "sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9"}, - {file = "google_api_core-2.24.2.tar.gz", hash = "sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696"}, + {file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"}, + {file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"}, ] [package.dependencies] @@ -959,21 +959,21 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 requests = ">=2.18.0,<3.0.0" [package.extras] -async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] -grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] +async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.0)"] +grpc = ["grpcio (>=1.33.2,<2.0.0)", "grpcio (>=1.49.1,<2.0.0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.0)", "grpcio-status (>=1.49.1,<2.0.0) ; python_version >= \"3.11\""] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] [[package]] name = "google-api-python-client" -version = "2.170.0" +version = "2.171.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.170.0-py3-none-any.whl", hash = "sha256:7bf518a0527ad23322f070fa69f4f24053170d5c766821dc970ff0571ec22748"}, - {file = "google_api_python_client-2.170.0.tar.gz", hash = "sha256:75f3a1856f11418ea3723214e0abc59d9b217fd7ed43dcf743aab7f06ab9e2b1"}, + {file = "google_api_python_client-2.171.0-py3-none-any.whl", hash = "sha256:c9c9b76f561e9d9ac14e54a9e2c0842876201d5b96e69e48f967373f0784cbe9"}, + {file = "google_api_python_client-2.171.0.tar.gz", hash = "sha256:057a5c08d28463c6b9eb89746355de5f14b7ed27a65c11fdbf1d06c66bb66b23"}, ] [package.dependencies] @@ -985,14 +985,14 @@ uritemplate = ">=3.0.1,<5" [[package]] name = "google-auth" -version = "2.40.2" +version = "2.40.3" description = "Google Authentication Library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_auth-2.40.2-py2.py3-none-any.whl", hash = "sha256:f7e568d42eedfded58734f6a60c58321896a621f7c116c411550a4b4a13da90b"}, - {file = "google_auth-2.40.2.tar.gz", hash = "sha256:a33cde547a2134273226fa4b853883559947ebe9207521f7afc707efbf690f58"}, + {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, + {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, ] [package.dependencies] @@ -2130,7 +2130,7 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" -groups = ["main", "docs"] +groups = ["main", "dev", "docs"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -2335,26 +2335,27 @@ files = [ [[package]] name = "pytest" -version = "8.3.5" +version = "8.4.0" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main", "dev"] files = [ - {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, - {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, + {file = "pytest-8.4.0-py3-none-any.whl", hash = "sha256:f40f825768ad76c0977cbacdf1fd37c6f7a468e460ea6a0636078f8972d4517e"}, + {file = "pytest-8.4.0.tar.gz", hash = "sha256:14d920b48472ea0dbf68e45b96cd1ffda4705f33307dcc86c676c1b5104838a6"}, ] [package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} +iniconfig = ">=1" +packaging = ">=20" pluggy = ">=1.5,<2" +pygments = ">=2.7.2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-html" @@ -2766,19 +2767,19 @@ files = [ [[package]] name = "requests" -version = "2.32.3" +version = "2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main", "docs"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""} urllib3 = ">=1.21.1,<3" @@ -2894,7 +2895,7 @@ description = "Manipulate well-formed Roman numerals" optional = false python-versions = ">=3.9" groups = ["docs"] -markers = "python_version >= \"3.11\"" +markers = "python_version != \"3.10\"" files = [ {file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"}, {file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"}, @@ -2921,14 +2922,14 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruamel-yaml" -version = "0.18.12" +version = "0.18.14" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "ruamel.yaml-0.18.12-py3-none-any.whl", hash = "sha256:790ba4c48b6a6e6b12b532a7308779eb12d2aaab3a80fdb8389216f28ea2b287"}, - {file = "ruamel.yaml-0.18.12.tar.gz", hash = "sha256:5a38fd5ce39d223bebb9e3a6779e86b9427a03fb0bf9f270060f8b149cffe5e2"}, + {file = "ruamel.yaml-0.18.14-py3-none-any.whl", hash = "sha256:710ff198bb53da66718c7db27eec4fbcc9aa6ca7204e4c1df2f282b6fe5eb6b2"}, + {file = "ruamel.yaml-0.18.14.tar.gz", hash = "sha256:7227b76aaec364df15936730efbf7d72b30c0b79b1d578bbb8e3dcb2d81f52b7"}, ] [package.dependencies] @@ -3112,14 +3113,14 @@ websocket-client = ">=1.8.0,<1.9.0" [[package]] name = "seleniumbase" -version = "4.39.2" +version = "4.39.3" description = "A complete web automation framework for end-to-end testing." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "seleniumbase-4.39.2-py3-none-any.whl", hash = "sha256:23b2d071c02ba269a8239b828fd5098edb208d04171143c93b40d8a351ba2861"}, - {file = "seleniumbase-4.39.2.tar.gz", hash = "sha256:3a18d582ca90f4d633debb8ec45871db1b7aed71e5876fc634962fba79731967"}, + {file = "seleniumbase-4.39.3-py3-none-any.whl", hash = "sha256:cbb94d7858a9ef3b0b4431a5879150649f4a73029afaa8ecfb7bda113f2565e1"}, + {file = "seleniumbase-4.39.3.tar.gz", hash = "sha256:b32978e685b1e4e2c7859b2dcb377ac14ba99bf748ea428548f9e450257b861b"}, ] [package.dependencies] @@ -3156,7 +3157,7 @@ pygments = ">=2.19.1" pynose = ">=1.5.4" pyotp = "2.9.0" pyreadline3 = {version = ">=3.5.3", markers = "platform_system == \"Windows\""} -pytest = "8.3.5" +pytest = {version = "8.4.0", markers = "python_version >= \"3.9\""} pytest-html = "4.0.2" pytest-metadata = "3.1.1" pytest-ordering = "0.6" @@ -3164,11 +3165,11 @@ pytest-rerunfailures = {version = "15.1", markers = "python_version >= \"3.9\""} pytest-xdist = {version = "3.7.0", markers = "python_version >= \"3.9\""} python-xlib = {version = "0.33", markers = "platform_system == \"Linux\""} pyyaml = ">=6.0.2" -requests = "2.32.3" +requests = "2.32.4" rich = ">=14.0.0,<15" sbvirtualdisplay = ">=1.4.0" selenium = {version = "4.33.0", markers = "python_version >= \"3.10\""} -setuptools = {version = ">=80.8.0", markers = "python_version >= \"3.10\""} +setuptools = {version = ">=80.9.0", markers = "python_version >= \"3.10\""} six = ">=1.17.0" sniffio = "1.3.1" sortedcontainers = "2.4.0" @@ -3323,7 +3324,7 @@ description = "Python documentation generator" optional = false python-versions = ">=3.11" groups = ["docs"] -markers = "python_version >= \"3.11\"" +markers = "python_version != \"3.10\"" files = [ {file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"}, {file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"}, @@ -3801,14 +3802,14 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "uritemplate" -version = "4.1.1" +version = "4.2.0" description = "Implementation of RFC 6570 URI Templates" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"}, - {file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"}, + {file = "uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686"}, + {file = "uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e"}, ] [[package]] @@ -4120,14 +4121,14 @@ h11 = ">=0.9.0,<1" [[package]] name = "yt-dlp" -version = "2025.5.22" +version = "2025.6.9" description = "A feature-rich command-line audio/video downloader" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "yt_dlp-2025.5.22-py3-none-any.whl", hash = "sha256:a49c4b76afeaded6254c3e2b759d8d5a13271aa963d5fccb51fe059d1c313151"}, - {file = "yt_dlp-2025.5.22.tar.gz", hash = "sha256:ea73854c5dabc124f29a35a8fae9bc5d422ef3231bebeea2bdfa82ac191a9c29"}, + {file = "yt_dlp-2025.6.9-py3-none-any.whl", hash = "sha256:ebdfda9ffa807f6a26aed7c8f906e5557cd06b4c388dc547df1ec2078631fca8"}, + {file = "yt_dlp-2025.6.9.tar.gz", hash = "sha256:751f53a3b61353522bf805fa30bbcbd16666126537e39706eab4f8c368f111ac"}, ] [package.dependencies] @@ -4142,7 +4143,7 @@ urllib3 = {version = ">=1.26.17,<3", optional = true, markers = "extra == \"defa websockets = {version = ">=13.0", optional = true, markers = "extra == \"default\""} [package.extras] -build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"] +build = ["build", "hatchling", "pip", "setuptools (>=71.0.2,<81)", "wheel"] curl-cffi = ["curl-cffi (>=0.5.10,<0.6.dev0 || ==0.10.*) ; implementation_name == \"cpython\""] default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"] dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.11.0,<0.12.0)"] From fdbe96f2e44fd87b88e8a6fd1cab8436cdebcee1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:44:14 +0100 Subject: [PATCH 14/19] vk and reddit should work without credentials but log the error --- .../modules/antibot_extractor_enricher/dropins/reddit.py | 1 + .../modules/antibot_extractor_enricher/dropins/vk.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index c46ca17..14d9c8c 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -29,6 +29,7 @@ class RedditDropin(Dropin): self.sb.open(url) return True + @logger.catch def _login(self): self.sb.click_link_text("Log in") self.sb.wait_for_ready_state_complete() diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 76e176e..6888727 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -37,6 +37,7 @@ class VkDropin(Dropin): self.sb.open(url) return True + @logger.catch def _login(self) -> bool: # TODO: test method self.sb.open("https://vk.com") From 71636233cbd537a6eca1cd5b47eb895abbbf89aa Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 17:07:10 +0100 Subject: [PATCH 15/19] adds migration information and VkDropin info. --- .../source/how_to/upgrading_1_0_1_to_1_1_0.md | 40 +++++++++++++++++++ .../modules/generic_extractor/__manifest__.py | 2 + 2 files changed, 42 insertions(+) create mode 100644 docs/source/how_to/upgrading_1_0_1_to_1_1_0.md diff --git a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md new file mode 100644 index 0000000..7e8d398 --- /dev/null +++ b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md @@ -0,0 +1,40 @@ +# Upgrading from v1.0.1 + +```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.0.1). + +If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you. +``` + +Versions 1.1.0+ of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications. + + +## Dropping `vk_extractor` module +We have dropped the `vk_extractor` because of problems in a project we relied on. You will need to remove it from your configuration file, otherwise you will see an error like: + +```{code} console +Module 'vk_extractor' not found. Are you sure it's installed/exists? +``` + +## New `antibot_extractor_enricher` module +We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this: + +```{code} yaml +steps: + extractor_enrichers: + - antibot_extractor_enricher +``` + +It comes with Dropins that we will be adding and maintaining. + +> Dropin: A module that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though. + +One such Dropin is the VkDropin which uses this automated browser to access VKontakte (VK) pages. You should add a username/password to the configuration file if you get authentication blocks from VK, to do so use the [authentication settings](authentication_how_to.md): + +```{code} yaml +authentication: + vk: + username: your_username + password: your_password +``` + +See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin. \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 72db630..52cf8b8 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -30,6 +30,8 @@ For a full list of video platforms supported by `yt-dlp`, see the custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). +You can see all currently implemented dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/generic_extractor). + ### Auto-Updates The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default). From 54fda9cad4a83bec52d522ca64d07b6acd8330f6 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:04:27 +0100 Subject: [PATCH 16/19] antibot in docker uses a different user_data_dir --- .../__manifest__.py | 2 +- .../antibot_extractor_enricher.py | 22 ++++++------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index 214653c..c7be89f 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -23,7 +23,7 @@ }, "user_data_dir": { "default": "secrets/antibot_user_data", - "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.", + "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.", }, "detect_auth_wall": { "default": True, diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 08d0c03..549cced 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -6,7 +6,6 @@ import sys import traceback from urllib.parse import urljoin import glob -import stat import importlib.util from loguru import logger @@ -41,7 +40,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): else: self.max_download_videos = int(self.max_download_videos) - self._prepare_and_warn_about_docker_and_user_data_dir() + self._prepare_user_data_dir() self.dropins = self.load_dropins() @@ -79,19 +78,12 @@ class AntibotExtractorEnricher(Extractor, Enricher): result.status = "antibot" return result - def _prepare_and_warn_about_docker_and_user_data_dir(self): - os.makedirs(self.user_data_dir, exist_ok=True) - - in_docker = os.environ.get("RUNNING_IN_DOCKER") - if in_docker and self.user_data_dir: - st = os.stat(self.user_data_dir) - perms = stat.filemode(st.st_mode) - owner = st.st_uid - group = st.st_gid - if owner != 0 or group != 0: - logger.warning( - f"""ANTIBOT: Running in Docker with user_data_dir {self.user_data_dir} with permissions {perms} and non-root {owner=}. This may cause issues with Chrome, if you get 'session not created' errors make sure to remove the folder and let docker create it.""" - ) + def _prepare_user_data_dir(self): + if self.user_data_dir: + in_docker = os.environ.get("RUNNING_IN_DOCKER") + if in_docker: + self.user_data_dir = self.user_data_dir.rstrip(os.path.sep) + "_docker" + os.makedirs(self.user_data_dir, exist_ok=True) def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None From fc89d9651758dce463c060da8c114e05b9f0a946 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:04:33 +0100 Subject: [PATCH 17/19] escape sequence --- .../modules/antibot_extractor_enricher/dropins/reddit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index 14d9c8c..44d572b 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -69,7 +69,7 @@ class RedditDropin(Dropin): @logger.catch def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: - filtered_urls = self.sb.execute_script(f""" + filtered_urls = self.sb.execute_script(rf""" return [...document.querySelectorAll("{self.video_selectors()}")] .map(el => el.src || el.href) .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); From 6279610a43c4cd78317b74c1a10c589b019cac31 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:28:45 +0100 Subject: [PATCH 18/19] updates docs --- docs/source/how_to/upgrading_1_0_1_to_1_1_0.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md index 7e8d398..81e00e2 100644 --- a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md +++ b/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md @@ -1,6 +1,6 @@ # Upgrading from v1.0.1 -```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.0.1). +```{note} This how-to is only relevant for people who used Auto Archiver before June 2025 (versions prior to 1.1.0). If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you. ``` @@ -15,18 +15,22 @@ We have dropped the `vk_extractor` because of problems in a project we relied on Module 'vk_extractor' not found. Are you sure it's installed/exists? ``` -## New `antibot_extractor_enricher` module +## New `antibot_extractor_enricher` module and VkDropin We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this: ```{code} yaml steps: - extractor_enrichers: - - antibot_extractor_enricher + extractors: + - antibot_extractor_enricher + + # or alternatively, if you want to use it as an enricher: + enrichers: + - antibot_extractor_enricher ``` It comes with Dropins that we will be adding and maintaining. -> Dropin: A module that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though. +> Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though. One such Dropin is the VkDropin which uses this automated browser to access VKontakte (VK) pages. You should add a username/password to the configuration file if you get authentication blocks from VK, to do so use the [authentication settings](authentication_how_to.md): From 8314833ae8ef1117112ed947e79b0f9cabde98a2 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:34:33 +0100 Subject: [PATCH 19/19] removes exclude_media_extensions option --- .../modules/antibot_extractor_enricher/__manifest__.py | 4 ---- .../antibot_extractor_enricher.py | 8 -------- src/auto_archiver/utils/url.py | 4 +++- tests/extractors/test_antibot_extractor_enricher.py | 1 - 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index c7be89f..e2bcad9 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -17,10 +17,6 @@ "default": 50, "help": "maximum number of videos to download from the page (0 = no download, inf = no limit).", }, - "exclude_media_extensions": { - "default": ".svg,.ico,.gif", - "help": "CSV of media (image/video) file extensions to exclude from download", - }, "user_data_dir": { "default": "secrets/antibot_user_data", "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.", diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 549cced..1982389 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -1,6 +1,5 @@ import base64 import math -import mimetypes import os import sys import traceback @@ -26,10 +25,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): self.agent = None # Use the default UserAgent # parse configuration options - self.exclude_media_mimetypes = set( - [mimetypes.guess_type(f"file{m}")[0] for m in self.exclude_media_extensions.split(",")] - ) - {None} - if self.max_download_images == "inf": self.max_download_images = math.inf else: @@ -292,9 +287,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): break if not is_relevant_url(src): continue - mimetype = mimetypes.guess_type(src)[0] - if mimetype in self.exclude_media_mimetypes: - continue full_src = urljoin(url, src) if full_src not in all_urls: filename, full_src = self.download_from_url(full_src, try_best_quality=True) diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index ea03d7f..2bb19cf 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -111,12 +111,14 @@ def is_relevant_url(url: str) -> bool: ("emoji.redditmedia.com",), ] + # TODO: make these globally configurable IRRELEVANT_ENDS_WITH = [ ".svg", # ignore SVGs ".ico", # ignore icons + # ignore index files for videos, these should be handled by ytdlp ".m3u8", ".mpd", - ".ism", # ignore index files for videos, these should be handled by ytdlp + ".ism", ] for end in IRRELEVANT_ENDS_WITH: diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 600baf2..06107b4 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -34,7 +34,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "save_to_pdf": False, "max_download_images": 0, "max_download_videos": 0, - "exclude_media_extensions": ".svg,.ico,.gif", "proxy": None, }