From 212bf67ab1b01166420a054b6db9b3974a3e8292 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 00:29:36 +0100 Subject: [PATCH 01/17] installs ffmpeg in readthedocs --- .readthedocs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8d32a3a..5a0ae62 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,8 @@ version: 2 build: os: ubuntu-22.04 + apt_packages: + - ffmpeg tools: python: "3.10" nodejs: "22" From 302e6f4258a17acdb366964505d4f172360f3638 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:35:43 +0100 Subject: [PATCH 02/17] logs improved --- .../modules/gsheet_feeder_db/gsheet_feeder_db.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index 00b403a..10300e0 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -117,13 +117,15 @@ class GsheetsFeederDB(Feeder, Database): def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') cell_updates = [] row_values = gw.get_row(row) + spreadsheet = gw.wks.spreadsheet.title + worksheet = gw.wks.title + logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}") + def batch_if_valid(col, val, final_value=None): final_value = final_value or val try: From d46eeee9b64770e9abec5d9ffbc1fa26e42ff5b8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:35:51 +0100 Subject: [PATCH 03/17] docs improved --- src/auto_archiver/modules/generic_extractor/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 52cf8b8..62bd4c8 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -58,7 +58,7 @@ If you are having issues with the extractor, you can review the version of `yt-d }, "proxy": { "default": "", - "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port", + "help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port", }, "end_means_success": { "default": True, From 4a36e6f6b0ded28a703b2c545d6cbaafd2f0daec Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:50:21 +0100 Subject: [PATCH 04/17] fix tests --- tests/databases/test_gsheet_db.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 0760c79..d951fc5 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -10,7 +10,10 @@ def mock_gworksheet(mocker): mock_gworksheet = mocker.MagicMock(spec=GWorksheet) mock_gworksheet.col_exists.return_value = True mock_gworksheet.get_cell.return_value = "" - mock_gworksheet.get_row.return_value = {} + mock_gworksheet.wks = mocker.MagicMock() + mock_gworksheet.wks.spreadsheet = mocker.MagicMock() + mock_gworksheet.wks.spreadsheet.title = "Test Spreadsheet" + mock_gworksheet.title = "Test Worksheet" return mock_gworksheet From 592dc3041517f9db51b3830236e793f3c3140604 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 16:40:55 +0100 Subject: [PATCH 05/17] closes #330 --- .../instagram_api_extractor/__manifest__.py | 2 +- .../instagram_api_extractor.py | 90 ++++++++++++------- .../test_instagram_api_extractor.py | 15 ++-- 3 files changed, 66 insertions(+), 41 deletions(-) diff --git a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index c40a5d8..2ecc6b9 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -22,7 +22,7 @@ "full_profile_max_posts": { "default": 0, "type": "int", - "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", + "help": "Use to limit the number of posts to download when full_profile is true or when a URL for multiple posts is passed (like /stories /highlights ...). 0 means no limit. when full_profile is true the order of downloaded content is stories -> posts -> tagged posts -> highlights, so a value of 10 could download 2 stories, 7 posts, 1 tagged posts, and 0 highlights.", }, "minimize_json_output": { "default": True, diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 5f13ecf..1694ddc 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -8,8 +8,10 @@ data, reducing JSON output size, and handling large profiles. """ +import math import re from datetime import datetime +import traceback import requests from loguru import logger @@ -35,10 +37,12 @@ class InstagramAPIExtractor(Extractor): def setup(self) -> None: if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] + self.full_profile_max_posts = int(self.full_profile_max_posts or 0) + if self.full_profile_max_posts == 0: + self.full_profile_max_posts = math.inf def download(self, item: Metadata) -> Metadata: url = item.get_url() - url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") insta_matches = self.valid_url.findall(url) logger.info(f"{insta_matches=}") @@ -97,57 +101,74 @@ class InstagramAPIExtractor(Extractor): filename = self.download_from_url(pic_url) result.add_media(Media(filename=filename), id="profile_picture") + count_posts = 0 if self.full_profile: user_id = user.get("pk") # download all stories try: - stories = self._download_stories_reusable(result, username) + stories = self._download_stories_reusable( + result, username, max_to_download=self.full_profile_max_posts - count_posts + ) + count_posts += len(stories) result.set("#stories", len(stories)) except Exception as e: result.append("errors", f"Error downloading stories for {username}") - logger.error(f"Error downloading stories for {username}: {e}") + logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}") # download all posts try: - self.download_all_posts(result, user_id) + if count_posts < self.full_profile_max_posts: + count_posts += self.download_all_posts( + result, user_id, max_to_download=self.full_profile_max_posts - count_posts + ) except Exception as e: result.append("errors", f"Error downloading posts for {username}") - logger.error(f"Error downloading posts for {username}: {e}") + logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}") # download all tagged try: - self.download_all_tagged(result, user_id) + if count_posts < self.full_profile_max_posts: + count_posts += self.download_all_tagged( + result, user_id, max_to_download=self.full_profile_max_posts - count_posts + ) except Exception as e: result.append("errors", f"Error downloading tagged posts for {username}") - logger.error(f"Error downloading tagged posts for {username}: {e}") + logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}") # download all highlights try: - self.download_all_highlights(result, username, user_id) + if count_posts < self.full_profile_max_posts: + count_posts += self.download_all_highlights( + result, username, user_id, max_to_download=self.full_profile_max_posts - count_posts + ) except Exception as e: result.append("errors", f"Error downloading highlights for {username}") - logger.error(f"Error downloading highlights for {username}: {e}") + logger.error(f"Error downloading highlights for {username}: {e} {traceback.format_exc()}") result.set_url(url) # reset as scrape_item modifies it return result.success("insta profile") - def download_all_highlights(self, result, username, user_id): + def download_all_highlights(self, result, username, user_id, max_to_download: int) -> int: count_highlights = 0 highlights = self.call_api("v1/user/highlights", {"user_id": user_id}) + highlights = highlights[: min(max_to_download, len(highlights))] # newest to oldest for h in highlights: try: - h_info = self._download_highlights_reusable(result, h.get("pk")) + h_info = self._download_highlights_reusable(result, h.get("pk"), max_to_download=max_to_download) count_highlights += len(h_info.get("items", [])) except Exception as e: result.append( "errors", f"Error downloading highlight id{h.get('pk')} for {username}", ) - logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}") - if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts: - logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}") + logger.error( + f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" + ) + if count_highlights >= max_to_download: + logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") break result.set("#highlights", count_highlights) + return count_highlights def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata: if id: @@ -166,13 +187,13 @@ class InstagramAPIExtractor(Extractor): return result.success(f"insta {context or 'post'}") def download_highlights(self, result: Metadata, id: str) -> Metadata: - h_info = self._download_highlights_reusable(result, id) + h_info = self._download_highlights_reusable(result, id, self.full_profile_max_posts) items = len(h_info.get("items", [])) del h_info["items"] result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items) return result.success("insta highlights") - def _download_highlights_reusable(self, result: Metadata, id: str) -> dict: + def _download_highlights_reusable(self, result: Metadata, id: str, max_to_download: int) -> dict: full_h = self.call_api("v2/highlight/by/id", {"id": id}) h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}") assert h_info, f"Highlight {id} not found: {full_h=}" @@ -182,38 +203,39 @@ class InstagramAPIExtractor(Extractor): result.add_media(Media(filename=filename), id=f"cover_media highlight {id}") items = h_info.get("items", [])[::-1] # newest to oldest + items = items[: min(max_to_download, len(items))] for h in tqdm(items, desc="downloading highlights", unit="highlight"): try: self.scrape_item(result, h, "highlight") except Exception as e: result.append("errors", f"Error downloading highlight {h.get('id')}") - logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}") + logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") return h_info def download_stories(self, result: Metadata, username: str) -> Metadata: now = datetime.now().strftime("%Y-%m-%d_%H-%M") - stories = self._download_stories_reusable(result, username) + stories = self._download_stories_reusable(result, username, max_to_download=self.full_profile_max_posts) if stories == []: return result.success("insta no story") result.set_title(f"stories {username} at {now}").set("#stories", len(stories)) return result.success(f"insta stories {now}") - def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]: + def _download_stories_reusable(self, result: Metadata, username: str, max_to_download: int) -> list[dict]: stories = self.call_api("v1/user/stories/by/username", {"username": username}) if not stories or not len(stories): return [] - stories = stories[::-1] # newest to oldest + stories = stories[::-1][: min(max_to_download, len(stories))] # newest to oldest for s in tqdm(stories, desc="downloading stories", unit="story"): try: self.scrape_item(result, s, "story") except Exception as e: result.append("errors", f"Error downloading story {s.get('id')}") - logger.error(f"Error downloading story, skipping {s.get('id')}: {e}") + logger.error(f"Error downloading story, skipping {s.get('id')}: {e} {traceback.format_exc()}") return stories - def download_all_posts(self, result: Metadata, user_id: str): + def download_all_posts(self, result: Metadata, user_id: str, max_to_download: int) -> int: end_cursor = None pbar = tqdm(desc="downloading posts") @@ -223,22 +245,23 @@ class InstagramAPIExtractor(Extractor): if not posts or not isinstance(posts, list) or len(posts) != 2: break posts, end_cursor = posts[0], posts[1] - logger.info(f"parsing {len(posts)} posts, next {end_cursor=}") - + posts = posts[: min(max_to_download, len(posts))] + logger.info(f"parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}") for p in posts: try: self.scrape_item(result, p, "post") except Exception as e: result.append("errors", f"Error downloading post {p.get('id')}") - logger.error(f"Error downloading post, skipping {p.get('id')}: {e}") + logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) post_count += 1 - if self.full_profile_max_posts and post_count >= self.full_profile_max_posts: - logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}") + if post_count >= max_to_download: + logger.info(f"POSTS reached max_to_download={self.full_profile_max_posts}") break result.set("#posts", post_count) + return post_count - def download_all_tagged(self, result: Metadata, user_id: str): + def download_all_tagged(self, result: Metadata, user_id: str, max_to_download: int) -> int: next_page_id = "" pbar = tqdm(desc="downloading tagged posts") @@ -251,21 +274,22 @@ class InstagramAPIExtractor(Extractor): next_page_id = resp.get("next_page_id") logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}") - + posts = posts[: min(max_to_download, len(posts))] for p in posts: try: self.scrape_item(result, p, "tagged") except Exception as e: result.append("errors", f"Error downloading tagged post {p.get('id')}") - logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}") + logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) tagged_count += 1 - if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts: - logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}") + if tagged_count >= max_to_download: + logger.info(f"TAGS reached max_to_download={self.full_profile_max_posts}") break result.set("#tagged", tagged_count) + return tagged_count - ### reusable parsing utils below + # reusable parsing utils below def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict: """ diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index d8a1cc0..8463c49 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -1,4 +1,5 @@ from datetime import datetime +import math import pytest @@ -147,14 +148,14 @@ class TestInstagramAPIExtractor(TestExtractorBase): self.extractor.full_profile = True mock_call.side_effect = [mock_user_response, mock_story_response] - mock_highlights.return_value = None + mock_highlights.return_value = 1 mock_stories.return_value = mock_story_response - mock_posts.return_value = None - mock_tagged.return_value = None + mock_posts.return_value = 2 + mock_tagged.return_value = 3 result = self.extractor.download_profile(metadata, "test_user") assert result.get("#stories") == len(mock_story_response) - mock_posts.assert_called_once_with(result, "123") + mock_posts.assert_called_once_with(result, "123", max_to_download=math.inf) assert "errors" not in result.metadata def test_download_profile_not_found(self, metadata, mocker): @@ -175,10 +176,10 @@ class TestInstagramAPIExtractor(TestExtractorBase): self.extractor.full_profile = True mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")] - mock_highlights.return_value = None - mock_tagged.return_value = None + mock_highlights.return_value = 1 + mock_tagged.return_value = 2 stories_tagged.return_value = None - mock_posts.return_value = None + mock_posts.return_value = 4 result = self.extractor.download_profile(metadata, "test_user") assert result.is_success() From 12b457706b1a673e7514f1cbf5fdf0771a280e1d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:37:44 +0100 Subject: [PATCH 06/17] closes #166 adds story URL feature to telethon extractor --- .../telethon_extractor/telethon_extractor.py | 112 +++++++++++------- tests/extractors/test_telethon_extractor.py | 36 ++++++ 2 files changed, 104 insertions(+), 44 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 279dccd..2643b32 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -5,6 +5,7 @@ import time from pathlib import Path from datetime import date +from telethon import functions from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError from telethon.tl.functions.messages import ImportChatInviteRequest @@ -24,7 +25,7 @@ from auto_archiver.utils import random_str class TelethonExtractor(Extractor): - valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") + valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+?)(\/s){0,1}\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") def setup(self) -> None: @@ -122,62 +123,85 @@ class TelethonExtractor(Extractor): is_private = match.group(1) == "/c" chat = int(match.group(2)) if is_private else match.group(2) - post_id = int(match.group(3)) + is_story = match.group(3) == "/s" + post_id = int(match.group(4)) result = Metadata() # NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token) with self.client.start(): # with self.client.start(bot_token=self.bot_token): - try: - post = self.client.get_messages(chat, ids=post_id) - except ValueError as e: - logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") - return False - except ChannelInvalidError as e: - logger.error( - f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}" - ) - return False + if is_story: + try: + stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id])) + if not stories.stories: + logger.info(f"No stories found for {url}, possibly it's private or the story has expired.") + return False + story = stories.stories[0] + logger.debug(f"TELETHON got story {story.id=} {story.date=} {story.expire_date=}") + result.set_timestamp(story.date).set("views", story.views.to_dict()).set( + "expire_date", story.expire_date + ) - logger.debug(f"TELETHON GOT POST {post=}") - if post is None: - return False + # download the story media + filename_dest = os.path.join(self.tmp_dir, f"{chat}_{post_id}", str(story.id)) + if filename := self.client.download_media(story.media, filename_dest): + result.add_media(Media(filename)) + except Exception as e: + logger.error(f"Error fetching story {post_id} from {chat}: {e}") + return False + else: + try: + post = self.client.get_messages(chat, ids=post_id) + except ValueError as e: + logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") + return False + except ChannelInvalidError as e: + logger.error( + f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}" + ) + return False - media_posts = self._get_media_posts_in_group(chat, post) - logger.debug(f"got {len(media_posts)=} for {url=}") + logger.debug(f"TELETHON got post {post=}") + if post is None: + return False - tmp_dir = self.tmp_dir + media_posts = self._get_media_posts_in_group(chat, post) + logger.debug(f"got {len(media_posts)=} for {url=}") - group_id = post.grouped_id if post.grouped_id is not None else post.id - title = post.message - for mp in media_posts: - if len(mp.message) > len(title): - title = mp.message # save the longest text found (usually only 1) + group_id = post.grouped_id if post.grouped_id is not None else post.id + title = post.message + for mp in media_posts: + if len(mp.message) > len(title): + title = mp.message # save the longest text found (usually only 1) - # media can also be in entities - if mp.entities: - other_media_urls = [ - e.url - for e in mp.entities - if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"] - ] - if len(other_media_urls): - logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}") - for i, om_url in enumerate(other_media_urls): - filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}") - result.add_media(Media(filename=filename), id=f"{group_id}_{i}") + # media can also be in entities + if mp.entities: + other_media_urls = [ + e.url + for e in mp.entities + if hasattr(e, "url") + and e.url + and self._guess_file_type(e.url) in ["video", "image", "audio"] + ] + if len(other_media_urls): + logger.debug( + f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}" + ) + for i, om_url in enumerate(other_media_urls): + filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}") + result.add_media(Media(filename=filename), id=f"{group_id}_{i}") - filename_dest = os.path.join(tmp_dir, f"{chat}_{group_id}", str(mp.id)) - filename = self.client.download_media(mp.media, filename_dest) - if not filename: - logger.debug(f"Empty media found, skipping {str(mp)=}") - continue - result.add_media(Media(filename)) + filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id)) + filename = self.client.download_media(mp.media, filename_dest) + if not filename: + logger.debug(f"Empty media found, skipping {str(mp)=}") + continue + result.add_media(Media(filename)) - result.set_title(title).set_timestamp(post.date).set("api_data", post.to_dict()) - if post.message != title: - result.set_content(post.message) + result.set_title(title).set_timestamp(post.date).set("api_data", post.to_dict()) + if post.message != title: + result.set_content(post.message) return result.success("telethon") def _get_media_posts_in_group(self, chat, original_post, max_amp=10): diff --git a/tests/extractors/test_telethon_extractor.py b/tests/extractors/test_telethon_extractor.py index 0a2d5c8..a1a5aa9 100644 --- a/tests/extractors/test_telethon_extractor.py +++ b/tests/extractors/test_telethon_extractor.py @@ -3,6 +3,8 @@ from datetime import date import pytest +from auto_archiver.modules.telethon_extractor.telethon_extractor import TelethonExtractor + @pytest.fixture(autouse=True) def mock_client_setup(mocker): @@ -24,3 +26,37 @@ def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker): assert session_file.exists() assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file assert os.path.exists(lazy_module._instance.session_file + ".session") + + +@pytest.mark.parametrize( + "url,expected", + [ + ("https://t.me/channel/123", True), + ("https://t.me/c/123/456", True), + ("https://t.me/channel/s/789", True), + ("https://t.me/c/123/s/456", True), + ("https://t.me/with_single/1234567?single", True), + ("https://t.me/invalid", False), + ("https://example.com/nottelegram/123", False), + ], +) +def test_valid_url_regex(url, expected, get_lazy_module): + match = TelethonExtractor.valid_url.search(url) + assert bool(match) == expected + + +@pytest.mark.parametrize( + "invite,expected", + [ + ("t.me/joinchat/AAAAAE", True), + ("t.me/+AAAAAE", True), + ("t.me/AAAAAE", True), + ("https://t.me/joinchat/AAAAAE", True), + ("https://t.me/+AAAAAE", True), + ("https://t.me/AAAAAE", True), + ("https://example.com/AAAAAE", False), + ], +) +def test_invite_pattern_regex(invite, expected, get_lazy_module): + match = TelethonExtractor.invite_pattern.search(invite) + assert bool(match) == expected From ade7feb5a0619ec628b71a69aae717edb15334de Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:38:17 +0100 Subject: [PATCH 07/17] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdbb86b..4b2eff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.1.0" +version = "1.1.1" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" From ce4d7ac6496d2621fdace69217014e6c6a04bae0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 21 Jun 2025 15:54:51 +0100 Subject: [PATCH 08/17] WIP refactor logging --- scripts/telegram_setup.py | 2 +- src/auto_archiver/core/base_module.py | 2 +- src/auto_archiver/core/config.py | 8 +-- src/auto_archiver/core/extractor.py | 6 +- src/auto_archiver/core/media.py | 5 +- src/auto_archiver/core/metadata.py | 2 +- src/auto_archiver/core/module.py | 2 +- src/auto_archiver/core/orchestrator.py | 48 ++++++++------- src/auto_archiver/core/storage.py | 2 +- .../antibot_extractor_enricher.py | 23 ++++--- .../captcha_services/anti_captcha.py | 60 +++++++++++++++++++ .../antibot_extractor_enricher/dropin.py | 7 ++- .../dropins/linkedin.py | 4 +- .../dropins/reddit.py | 8 +-- .../antibot_extractor_enricher/dropins/vk.py | 6 +- src/auto_archiver/modules/api_db/api_db.py | 6 +- .../atlos_feeder_db_storage.py | 16 ++--- .../modules/cli_feeder/cli_feeder.py | 5 -- .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 9 ++- .../modules/gdrive_storage/gdrive_storage.py | 10 ++-- .../modules/generic_extractor/bluesky.py | 4 +- .../generic_extractor/generic_extractor.py | 51 ++++++++-------- .../modules/generic_extractor/tiktok.py | 4 +- .../modules/generic_extractor/twitter.py | 6 +- .../gsheet_feeder_db/gsheet_feeder_db.py | 39 ++++++------ .../modules/hash_enricher/hash_enricher.py | 5 +- .../modules/html_formatter/html_formatter.py | 4 +- .../instagram_api_extractor.py | 36 +++++------ .../instagram_extractor.py | 40 ++++++------- .../instagram_tbot_extractor.py | 2 +- .../modules/json_enricher/json_enricher.py | 2 +- .../modules/local_storage/local_storage.py | 2 +- .../modules/meta_enricher/meta_enricher.py | 2 +- .../metadata_enricher/metadata_enricher.py | 2 +- .../opentimestamps_enricher.py | 2 +- .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- .../modules/s3_storage/s3_storage.py | 2 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../telegram_extractor/telegram_extractor.py | 2 +- .../telethon_extractor/telethon_extractor.py | 2 +- .../thumbnail_enricher/thumbnail_enricher.py | 2 +- .../timestamping_enricher.py | 2 +- .../twitter_api_extractor.py | 2 +- .../wacz_extractor_enricher.py | 2 +- .../wayback_extractor_enricher.py | 2 +- .../whisper_enricher/whisper_enricher.py | 2 +- src/auto_archiver/utils/custom_logger.py | 37 ++++++++++++ src/auto_archiver/utils/misc.py | 2 +- tests/conftest.py | 2 +- .../example_extractor/example_extractor.py | 2 +- .../example_module/example_module.py | 2 +- tests/test_implementation.py | 2 +- 54 files changed, 298 insertions(+), 207 deletions(-) create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py create mode 100644 src/auto_archiver/utils/custom_logger.py diff --git a/scripts/telegram_setup.py b/scripts/telegram_setup.py index 9480cd8..c11f94a 100644 --- a/scripts/telegram_setup.py +++ b/scripts/telegram_setup.py @@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run import os from telethon.sync import TelegramClient -from loguru import logger +from auto_archiver.utils.custom_logger import logger # Create a diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 6461ab7..f12c38d 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory from auto_archiver.utils import url as UrlUtil from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES -from loguru import logger +from auto_archiver.utils.custom_logger import logger if TYPE_CHECKING: from .module import ModuleFactory diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index a2d7679..8e65edf 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap import json import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from copy import deepcopy from auto_archiver.core.consts import MODULE_TYPES @@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser): """ Override of error to format a nicer looking error message using logger """ - logger.error("Problem with configuration file (tip: use --help to see the available options):") - logger.error(message) + logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}") self.exit(2) def parse_known_args(self, args=None, namespace=None): @@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser): try: self._check_value(action, action.default) except argparse.ArgumentError as e: - logger.error(f"You have an invalid setting in your configuration file ({action.dest}):") - logger.error(e) + logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}") exit() return super().parse_known_args(args, namespace) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 5dca928..1720c68 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -12,7 +12,7 @@ from contextlib import suppress import mimetypes import os import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from retrying import retry import re @@ -94,7 +94,7 @@ class Extractor(BaseModule): to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) if verbose: - logger.debug(f"downloading {url[0:50]=} {to_filename=}") + logger.debug(f"downloading {to_filename=}") headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" } @@ -117,7 +117,7 @@ class Extractor(BaseModule): return to_filename except requests.RequestException as e: - logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") + logger.warning(f"Failed to fetch the Media URL: {e}") if try_best_quality: return None, url diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 2fad0ec..fee81d3 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,7 +11,7 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -from loguru import logger +from auto_archiver.utils.custom_logger import logger @dataclass_json # annotation order matters @@ -121,8 +121,7 @@ class Media: except Error: return False # ffmpeg errors when reading bad files except Exception as e: - logger.error(e) - logger.error(traceback.format_exc()) + logger.error(f"{e}: {traceback.format_exc()}") try: fsize = os.path.getsize(self.filename) return fsize > 20_000 diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 370af78..f1ac3c0 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json import datetime from urllib.parse import urlparse from dateutil.parser import parse as parse_dt -from loguru import logger +from auto_archiver.utils.custom_logger import logger from .media import Media diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index f620500..1aad298 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -16,7 +16,7 @@ import sys from importlib.util import find_spec import os from os.path import join -from loguru import logger +from auto_archiver.utils.custom_logger import logger import auto_archiver from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a028ac7..27a1bc9e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -15,9 +15,11 @@ import traceback from copy import copy from rich_argparse import RichHelpFormatter -from loguru import logger +from auto_archiver.utils.custom_logger import logger import requests +from auto_archiver.utils.misc import random_str + from .metadata import Metadata, Media from auto_archiver.version import __version__ from .config import ( @@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ # add other logging info if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 use_level = logging_config["level"] - self.logger_id = logger.add(sys.stderr, level=use_level) + self.logger_id = logger.add( + sys.stderr, + level=use_level, + catch=True, + format="{level}: {message} {extra[serialize_no_message]}", + ) rotation = logging_config["rotation"] log_file = logging_config["file"] @@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ f"{log_file}.{i}_{level.lower()}", filter=lambda rec, lvl=level: rec["level"].name == lvl, rotation=rotation, + format="{extra[serialized]}", ) elif log_file: - logger.add(log_file, rotation=rotation, level=use_level) + logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}") def install_modules(self, modules_by_type): """ @@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ update_cmd = "`docker pull bellingcat/auto-archiver:latest`" else: update_cmd = "`pip install --upgrade auto-archiver`" - logger.warning("") - logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********") logger.warning( - f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})" + f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n" ) - logger.warning(f"Make sure to update to the latest version using: {update_cmd}") - logger.warning("") def setup(self, args: list): """ @@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.setup(args) return self.feed() except Exception as e: - logger.error(e) + logger.error(f"{e}: {traceback.format_exc()}") exit(1) def cleanup(self) -> None: @@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ url_count = 0 for feeder in self.feeders: for item in feeder: - yield self.feed_item(item) - url_count += 1 + with logger.contextualize(url=item.get_url(), trace=random_str(12)): + logger.info("started processing") + yield self.feed_item(item) + url_count += 1 - logger.info(f"Processed {url_count} URL(s)") + logger.info(f"processed {url_count} URL(s)") self.cleanup() def feed_item(self, item: Metadata) -> Metadata: @@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt on {item=}") + logger.warning("caught interrupt") for d in self.databases: d.aborted(item) self.cleanup() exit() except Exception as e: - logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}") + logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}") for d in self.databases: if isinstance(e, AssertionError): d.failed(item, str(e)) @@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: check_url_or_raise(original_url) except ValueError as e: - logger.error(f"Error archiving URL {original_url}: {e}") + logger.error(f"Error archiving: {e}") raise e # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs @@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ result.set_url(url) if original_url != url: - logger.debug(f"Sanitized URL from {original_url} to {url}") + logger.debug(f"Sanitized URL to {url}") result.set("original_url", original_url) # 2 - notify start to DBs, propagate already archived if feature enabled in DBs @@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(cached_result, cached=True) except Exception as e: - logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") return cached_result # 3 - call extractors until one succeeds for a in self.extractors: - logger.info(f"Trying extractor {a.name} for {url}") + logger.info(f"trying extractor {a.name}") try: result.merge(a.download(result)) if result.is_success(): break except Exception as e: - logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") + logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content for e in self.enrichers: try: e.enrich(result) except Exception as exc: - logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") + logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}") # 5 - store all downloaded/generated media result.store(storages=self.storages) @@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(result) except Exception as e: - logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") return result diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 3205f5a..fd743cb 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -24,7 +24,7 @@ from abc import abstractmethod from typing import IO import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 04e4702..e380adb 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -7,7 +7,7 @@ from urllib.parse import urljoin import glob import importlib.util -from loguru import logger +from auto_archiver.utils.custom_logger import logger import selenium from seleniumbase import SB @@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): continue # Skip imported modules/classes/functions if isinstance(obj, type) and issubclass(obj, Dropin): dropins.append(obj) - logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") + logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") return dropins def sanitize_url(self, url: str) -> str: @@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher): def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None url = to_enrich.get_url() - url_sample = url[:75] try: with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb: - logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...") + logger.info(f"selenium browser is up with agent {self.agent}, opening url...") sb.uc_open_with_reconnect(url, 4) - logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") + logger.debug("handling CAPTCHAs for...") sb.uc_gui_handle_cf() sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future @@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): dropin.open_page(url) if self.detect_auth_wall and self._hit_auth_wall(sb): - logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") + logger.warning("skipping since auth wall or CAPTCHA was detected") return False sb.wait_for_ready_state_complete() @@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): js_css_selector=dropin.js_for_video_css_selectors(), max_media=self.max_download_videos - downloaded_videos, ) - logger.info(f"ANTIBOT completed for {url_sample}") + logger.info("completed") return to_enrich except selenium.common.exceptions.SessionNotCreatedException as e: if custom_data_dir: # the retry logic only works once logger.error( - f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." + f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." ) return self.enrich(to_enrich, custom_data_dir=False) raise e # re-raise except Exception as e: - logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}") + logger.error(f"runtime error: {e}: {traceback.format_exc()}") return False def _get_suitable_dropin(self, url: str, sb: SB): @@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ for dropin in self.dropins: if dropin.suitable(url): - logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}") + logger.debug(f"using drop-in {dropin.__name__}") return dropin(sb, self) return DefaultDropin(sb, self) @@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) - logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") + logger.debug(f"setting window size to {x}x{y} for full page screenshot.") sb.set_window_size(x, y) screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") @@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): # js_for_css_selectors for src in sources: if len(all_urls) >= max_media: - logger.debug(f"Reached max download limit of {max_media} images/videos.") + logger.debug(f"reached max download limit of {max_media} images/videos.") break if not is_relevant_url(src): continue diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py new file mode 100644 index 0000000..f624953 --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py @@ -0,0 +1,60 @@ +# def solve_captcha(image_url): +# # Download image +# img_data = requests.get(image_url).content +# encoded_image = base64.b64encode(img_data).decode() + +# # Submit to AntiCaptcha +# task = { +# "clientKey": ANTI_CAPTCHA_KEY, +# "task": { +# "type": "ImageToTextTask", +# "body": encoded_image +# } +# } +# print("[*] Sending captcha request to anti-captcha...") + +# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json() +# task_id = task_response["taskId"] +# print(f"[*] Anti-captcha response: {task_response}") + +# # Poll for result +# while True: +# time.sleep(5) +# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={ +# "clientKey": ANTI_CAPTCHA_KEY, +# "taskId": task_id +# }).json() +# if res["status"] == "ready": +# print(f"[*] Captcha solved: {res}") +# return res["solution"]["text"] +# print(f"[*] Polling for captcha solution: {res['status']}") + + +# def solve_recaptcha(site_key, page_url): +# print("[*] Sending captcha request to anti-captcha...") +# # Step 1: Send captcha request +# task_payload = { +# "clientKey": ANTI_CAPTCHA_KEY, +# "task": { +# "type": "NoCaptchaTaskProxyless", +# "websiteURL": page_url, +# "websiteKey": site_key +# } +# } +# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json() +# print(f"[*] Anti-captcha response: {response}") +# task_id = response["taskId"] + +# # Step 2: Poll for solution +# print("[*] Polling for captcha solution...") +# for i in range(40): # ~80 seconds +# time.sleep(2) +# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={ +# "clientKey": ANTI_CAPTCHA_KEY, +# "taskId": task_id +# }).json() +# print(f" Poll {i+1}: status={result['status']}") +# if result["status"] == "ready": +# print("[*] Captcha solved!") +# return result["solution"]["gRecaptchaResponse"] +# raise TimeoutError("AntiCaptcha took too long") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index d4b255d..c45d7ad 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -1,6 +1,7 @@ import os +import traceback from typing import Mapping -from loguru import logger +from auto_archiver.utils.custom_logger import logger from seleniumbase import SB import yt_dlp @@ -143,7 +144,7 @@ class Dropin: with yt_dlp.YoutubeDL(validated_options) as ydl: for url in video_urls: try: - logger.debug(f"Downloading video from URL: {url}") + logger.debug("downloading video from url") info = ydl.extract_info(url, download=True) filename = ydl_entry_to_filename(ydl, info) if not filename: # Failed to download video. @@ -155,5 +156,5 @@ class Dropin: to_enrich.add_media(media) downloaded += 1 except Exception as e: - logger.error(f"Error downloading {url}: {e}") + logger.error(f"download failed: {e} {traceback.format_exc()}") return downloaded diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py index 336b630..082e409 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py @@ -1,5 +1,5 @@ from typing import Mapping -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -62,7 +62,7 @@ class LinkedinDropin(Dropin): self.sb.wait_for_ready_state_complete() username, password = self._get_username_password("linkedin.com") - logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username) + logger.debug("logging in to Linkedin with username: {}", username) self.sb.type("#username", username) self.sb.type("#password", password) self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index 3f699b6..7f5e23e 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -3,7 +3,7 @@ from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from loguru import logger +from auto_archiver.utils.custom_logger import logger class RedditDropin(Dropin): @@ -50,7 +50,7 @@ class RedditDropin(Dropin): self._close_cookies_banner() username, password = self._get_username_password("reddit.com") - logger.debug("RedditDropin Logging in to Reddit with username: {}", username) + logger.debug("logging in to Reddit with username: {}", username) self.sb.type("#login-username", username) self.sb.type("#login-password", password) @@ -68,7 +68,7 @@ class RedditDropin(Dropin): self.sb.click_link_text("Log in") self.sb.wait_for_ready_state_complete() if self.sb.is_text_visible("Welcome back"): - logger.debug("RedditDropin Login successful") + logger.debug("login successful") self.sb.click_if_visible("this link") def _close_cookies_banner(self): @@ -88,5 +88,5 @@ class RedditDropin(Dropin): .map(el => el.src || el.href) .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); """) - logger.debug("RedditDropin Found {} video URLs", len(filtered_urls)) + logger.debug("found {} video URLs", len(filtered_urls)) return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 3f92eda..02afd75 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -4,7 +4,7 @@ from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from loguru import logger +from auto_archiver.utils.custom_logger import logger class VkDropin(Dropin): @@ -57,12 +57,12 @@ class VkDropin(Dropin): self.sb.open("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): - logger.debug("Already logged in to VK.") + logger.debug("already logged in to VK.") return True # need to login username, password = self._get_username_password("vk.com") - logger.debug("Logging in to VK with username: {}", username) + logger.debug("logging in to VK with username: {}", username) self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index c422248..1475375 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import os import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database from auto_archiver.core import Metadata @@ -36,9 +36,9 @@ class AAApiDb(Database): if not self.store_results: return if cached: - logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") + logger.debug("skipping saving archive to AA API because it was cached") return - logger.debug(f"saving archive of {item.get_url()} to the AA API.") + logger.debug("saving archive to the AA API.") payload = { "author_id": self.author_id, diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index c84abd6..814800d 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -3,7 +3,7 @@ import os from typing import IO, Iterator, Optional, Union import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database, Feeder, Media, Metadata, Storage from auto_archiver.utils import calculate_file_hash @@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): """Mark an item as failed in Atlos, if the ID exists.""" atlos_id = item.metadata.get("atlos_id") if not atlos_id: - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + logger.info("No Atlos ID available, skipping") return self._post( f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", json={"metadata": {"processed": True, "status": "error", "error": reason}}, ) - logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}") + logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check and fetch if the given item has been archived already, each @@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): """Mark an item as successfully archived in Atlos.""" atlos_id = item.metadata.get("atlos_id") if not atlos_id: - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + logger.info("item has no Atlos ID, skipping") return self._post( f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", @@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): } }, ) - logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos") + logger.info(f"stored success ID {atlos_id} on Atlos") # ! Atlos Module - Storage Methods @@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: """Upload a media file to Atlos if it has not been uploaded already.""" if metadata is None: - logger.error(f"No metadata provided for {media.filename}") + logger.error(f"no metadata provided for {media.filename}") return False atlos_id = metadata.get("atlos_id") if not atlos_id: - logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.") + logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.") return False media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) @@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): params={"title": media.properties}, files={"file": (os.path.basename(media.filename), file_obj)}, ) - logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") + logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") return True def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 5935466..7bb243b 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,5 +1,3 @@ -from loguru import logger - from auto_archiver.core.feeder import Feeder from auto_archiver.core.metadata import Metadata from auto_archiver.core.consts import SetupError @@ -16,8 +14,5 @@ class CLIFeeder(Feeder): def __iter__(self) -> Metadata: urls = self.config["urls"] for url in urls: - logger.debug(f"Processing {url}") m = Metadata().set_url(url) yield m - - logger.success(f"Processed {len(urls)} URL(s)") diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index c6711c5..d6c1383 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index ac31027..aff4ad0 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -1,5 +1,5 @@ import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from csv import DictWriter from dataclasses import asdict diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 9c72162..f41f6b4 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger import csv from auto_archiver.core import Feeder @@ -20,20 +20,19 @@ class CSVFeeder(Feeder): url_column = first_row.index(url_column) except ValueError: logger.error( - f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" + f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" ) return elif not (url_or_none(first_row[url_column])): # it's a header row, but we've been given a column number already - logger.debug(f"Skipping header row: {first_row}") + logger.debug(f"skipping header row: {first_row}") else: # first row isn't a header row, rewind the file f.seek(0) for row in reader: if not url_or_none(row[url_column]): - logger.warning(f"Not a valid URL in row: {row}, skipping") + logger.warning(f"not a valid URL in row: {row}, skipping") continue url = row[url_column] - logger.debug(f"Processing {url}") yield Metadata().set_url(url) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 50ce244..6a15e80 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -8,7 +8,7 @@ from google.oauth2 import service_account from google.oauth2.credentials import Credentials from googleapiclient.discovery import build from googleapiclient.http import MediaFileUpload -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage @@ -23,10 +23,10 @@ class GDriveStorage(Storage): def _setup_google_drive_service(self): """Initialize Google Drive service based on provided credentials.""" if self.oauth_token: - logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}") + logger.debug(f"using Google Drive OAuth token: {self.oauth_token}") self.service = self._initialize_with_oauth_token() elif self.service_account: - logger.debug(f"Using Google Drive service account: {self.service_account}") + logger.debug(f"using Google Drive service account: {self.service_account}") self.service = self._initialize_with_service_account() else: raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") @@ -41,7 +41,7 @@ class GDriveStorage(Storage): if not creds.valid and creds.expired and creds.refresh_token: creds.refresh(Request()) with open(self.oauth_token, "w") as token_file: - logger.debug("Saving refreshed OAuth token.") + logger.debug("saving refreshed OAuth token.") token_file.write(creds.to_json()) elif not creds.valid: raise ValueError("Invalid OAuth token. Please regenerate the token.") @@ -180,7 +180,7 @@ class GDriveStorage(Storage): Creates a new GDrive folder @name inside folder @parent_id Returns id of the created folder """ - logger.debug(f"Creating new folder with {name=} inside {parent_id=}") + logger.debug(f"creating new folder with {name=} inside {parent_id=}") file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]} gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute() return gd_folder.get("id") diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 5baad6c..261ff03 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media @@ -18,7 +18,7 @@ class Bluesky(GenericDropin): # download if embeds present (1 video XOR >=1 images) for media in self._download_bsky_embeds(post, archiver): result.add_media(media) - logger.debug(f"Downloaded {len(result.media)} media files") + logger.debug(f"downloaded {len(result.media)} media files") return result diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index a65c5fe..f71ac28 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor from yt_dlp.utils import MaxDownloadsReached import pysubs2 -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media @@ -63,12 +63,11 @@ class GenericExtractor(Extractor): if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1": logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually") else: - logger.warning("yt-dlp or plugin was updated — restarting auto-archiver") - logger.warning(" ======= RESTARTING ======= ") + logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ") os.execv(sys.executable, [sys.executable] + sys.argv) def update_package(self, package_name: str) -> bool: - logger.info(f"Checking and updating {package_name}...") + logger.info(f"checking and updating {package_name}...") from importlib.metadata import version as get_version old_version = get_version(package_name) @@ -80,7 +79,7 @@ class GenericExtractor(Extractor): return True logger.info(f"{package_name} already up to date") except Exception as e: - logger.error(f"Error updating {package_name}: {e}") + logger.error(f"failed to update {package_name}: {e}") return False def setup_po_tokens(self) -> None: @@ -111,7 +110,7 @@ class GenericExtractor(Extractor): missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None] if missing_tools: logger.error( - f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " + f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " "Install these tools or run bgutils via Docker. " "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider" ) @@ -140,7 +139,7 @@ class GenericExtractor(Extractor): f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip" ) zip_path = os.path.join(base_dir, f"{plugin_version}.zip") - logger.info(f"Downloading bgutils release zip for version {plugin_version}...") + logger.info(f"downloading bgutils release zip for version {plugin_version}...") urlretrieve(zip_url, zip_path) with zipfile.ZipFile(zip_path, "r") as z: z.extractall(base_dir) @@ -149,7 +148,7 @@ class GenericExtractor(Extractor): extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}") shutil.move(os.path.join(extracted_root, "server"), server_dir) shutil.rmtree(extracted_root) - logger.info("Installing dependencies and transpiling PoT Generator script...") + logger.info("installing dependencies and transpiling PoT Generator script...") subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True) subprocess.run(["npx", "tsc"], cwd=server_dir, check=True) @@ -165,7 +164,7 @@ class GenericExtractor(Extractor): logger.info(f"PO Token script configured at: {script_path}") except Exception as e: - logger.error(f"Failed to set up PO Token script: {e}") + logger.error(f"failed to set up PO Token script: {e}") def suitable_extractors(self, url: str) -> Generator[str, None, None]: """ @@ -206,7 +205,7 @@ class GenericExtractor(Extractor): media = Media(cover_image_path) metadata.add_media(media, id="cover") except Exception as e: - logger.error(f"Error downloading cover image {thumbnail_url}: {e}") + logger.error(f"could not download cover image {thumbnail_url}: {e}") dropin = self.dropin_for_name(info_extractor.ie_key()) if dropin: @@ -353,7 +352,7 @@ class GenericExtractor(Extractor): if not dropin: # TODO: add a proper link to 'how to create your own dropin' - logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. + logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}. Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") return False @@ -389,7 +388,7 @@ class GenericExtractor(Extractor): # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. continue - logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}") + logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}") new_media = Media(filename) for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: @@ -404,12 +403,12 @@ class GenericExtractor(Extractor): text = " ".join([line.text for line in subs]) new_media.set(f"subtitles_{lang}", text) except Exception as e: - logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}") + logger.error(f"error loading subtitle file {val.get('filepath')}: {e}") result.add_media(new_media) except Exception as e: - logger.error(f"Error processing entry {entry}: {e}") + logger.error(f"error processing entry {entry}: {e}") if not len(result.media): - logger.info(f"No media found for entry {entry}, skipping.") + logger.info(f"no media found for entry {entry}, skipping.") return False return self.add_metadata(data, info_extractor, url, result) @@ -471,14 +470,14 @@ class GenericExtractor(Extractor): def _helper_for_successful_extract_info(data, info_extractor, url, ydl): if data.get("is_live", False) and not self.livestreams: - logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting") + logger.warning("livestream detected, skipping due to 'livestreams' configuration setting") return False # it's a valid video, that the youtubdedl can download out of the box return self.get_metadata_for_video(data, info_extractor, url, ydl) try: if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor): - logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}") + logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}") raise SkipYtdlp() # don't download since it can be a live stream @@ -497,17 +496,17 @@ class GenericExtractor(Extractor): if not isinstance(e, SkipYtdlp): logger.debug( - f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' + f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' ) try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: - logger.error("Error downloading metadata for post: {error}", error=str(post_e)) + logger.error("error downloading metadata for post: {error}", error=str(post_e)) return False except Exception as generic_e: logger.debug( - 'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}', + 'attempt to extract using ytdlp extractor "{name}" failed: \n {error}', name=info_extractor.IE_NAME, error=str(generic_e), exc_info=True, @@ -560,17 +559,17 @@ class GenericExtractor(Extractor): # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if "username" in auth and "password" in auth: - logger.debug(f"Using provided auth username and password for {url}") + logger.debug("using provided auth username and password") ydl_options.extend(("--username", auth["username"])) ydl_options.extend(("--password", auth["password"])) elif "cookie" in auth: - logger.debug(f"Using provided auth cookie for {url}") + logger.debug("using provided auth cookie") yt_dlp.utils.std_headers["cookie"] = auth["cookie"] elif "cookies_from_browser" in auth: - logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}") + logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}") ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) elif "cookies_file" in auth: - logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}") + logger.debug(f"using cookies from file {auth['cookies_file']}") ydl_options.extend(("--cookies", auth["cookies_file"])) # Applying user-defined extractor_args @@ -580,11 +579,11 @@ class GenericExtractor(Extractor): arg_str = ";".join(f"{k}={v}" for k, v in args.items()) else: arg_str = str(args) - logger.debug(f"Setting extractor_args: {key}:{arg_str}") + logger.debug(f"setting extractor_args: {key}:{arg_str}") ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"]) if self.ytdlp_args: - logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}") + logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}") ydl_options += self.ytdlp_args.split(" ") *_, validated_options = yt_dlp.parse_options(ydl_options) diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py index 902eb05..66936e3 100644 --- a/src/auto_archiver/modules/generic_extractor/tiktok.py +++ b/src/auto_archiver/modules/generic_extractor/tiktok.py @@ -1,5 +1,5 @@ import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE @@ -22,7 +22,7 @@ class Tiktok(GenericDropin): return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) def extract_post(self, url: str, ie_instance): - logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}") + logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}") endpoint = self.TIKWM_ENDPOINT.format(url=url) diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 9006e57..c5964ad 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,7 +1,7 @@ import re import mimetypes -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from auto_archiver.core.metadata import Metadata, Media @@ -40,7 +40,7 @@ class Twitter(GenericDropin): raise ValueError("Error retreiving post. Are you sure it exists?") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: - logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") + logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False full_text = tweet.pop("full_text", "") @@ -49,7 +49,7 @@ class Twitter(GenericDropin): result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) if not tweet.get("entities", {}).get("media"): - logger.debug("No media found, archiving tweet text only") + logger.debug("no media found, archiving tweet text only") result.status = "twitter-ytdl" return result for i, tw_media in enumerate(tweet["entities"]["media"]): diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index 10300e0..0f03de7 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects. """ import os +import traceback from typing import Tuple, Union, Iterator from urllib.parse import quote import gspread -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from retrying import retry @@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database): sh = self.open_sheet() for ii, worksheet in enumerate(sh.worksheets()): if not self.should_process_sheet(worksheet.title): - logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") + logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules") continue - logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}") + logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}") gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.debug( - f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" + f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" ) continue - - # process and yield metadata here: - yield from self._process_rows(gw) - logger.info(f"Finished worksheet {worksheet.title}") + with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"): + # process and yield metadata here: + yield from self._process_rows(gw) + logger.info(f"finished worksheet {worksheet.title}") def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): @@ -69,7 +70,9 @@ class GsheetsFeederDB(Feeder, Database): # All checks done - archival process starts here m = Metadata().set_url(url) self._set_context(m, gw, row) - yield m + + with logger.contextualize(row=row): + yield m def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: # TODO: Check folder value not being recognised @@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database): return missing def started(self, item: Metadata) -> None: - logger.info(f"STARTED {item}") + logger.info("STARTED") gw, row = self._retrieve_gsheet(item) gw.set_cell(row, "status", "Archive in progress") def failed(self, item: Metadata, reason: str) -> None: - logger.error(f"FAILED {item}") + logger.error("FAILED") self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") + logger.warning("ABORTED") self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: @@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database): cell_updates = [] row_values = gw.get_row(row) - spreadsheet = gw.wks.spreadsheet.title - worksheet = gw.wks.title - logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}") + logger.info("DONE") def batch_if_valid(col, val, final_value=None): final_value = final_value or val @@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database): if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") + logger.error(f"unable to batch {col}={final_value} due to {e}") status_message = item.status if cached: @@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database): gw, row = self._retrieve_gsheet(item) gw.set_cell(row, "status", new_status) except Exception as e: - logger.debug(f"Unable to update sheet: {e}") + logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: - logger.error( - f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder." - ) + logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.") return gw, row diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 71425f2..799c5b3 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -9,7 +9,7 @@ making it suitable for handling large files efficiently. """ import hashlib -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata @@ -22,8 +22,7 @@ class HashEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") + logger.debug(f"calculating media hashes with algo={self.algorithm}") for i, m in enumerate(to_enrich.media): if len(hd := self.calculate_hash(m.filename)): diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index f5da1d8..41188f1 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -4,7 +4,7 @@ import os import pathlib from jinja2 import Environment, FileSystemLoader from urllib.parse import quote -from loguru import logger +from auto_archiver.utils.custom_logger import logger import json import base64 @@ -35,7 +35,7 @@ class HtmlFormatter(Formatter): def format(self, item: Metadata) -> Media: url = item.get_url() if item.is_empty(): - logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}") + logger.debug("nothing to format, skipping") return content = self.template.render( diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 1694ddc..e21b089 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -14,7 +14,7 @@ from datetime import datetime import traceback import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from retrying import retry from tqdm import tqdm @@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor): url = item.get_url() url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") insta_matches = self.valid_url.findall(url) - logger.info(f"{insta_matches=}") + if not len(insta_matches) or len(insta_matches[0]) != 3: return if len(insta_matches) > 1: - logger.warning(f"Multiple instagram matches found in {url=}, using the first one") + logger.debug("multiple instagram matches found, using the first one") return g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] if g1 == "": @@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor): return self.download_post(item, id=g3, context="story") return self.download_stories(item, g2) else: - logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}") + logger.warning(f"unknown instagram regex group match {g1=}") return @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) @@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor): count_posts += len(stories) result.set("#stories", len(stories)) except Exception as e: - result.append("errors", f"Error downloading stories for {username}") - logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading stories for {username}") + logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}") # download all posts try: @@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"Error downloading posts for {username}") - logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading posts for {username}") + logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}") # download all tagged try: @@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"Error downloading tagged posts for {username}") - logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading tagged posts for {username}") + logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}") # download all highlights try: @@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor): except Exception as e: result.append( "errors", - f"Error downloading highlight id{h.get('pk')} for {username}", + f"error downloading highlight id{h.get('pk')} for {username}", ) logger.error( - f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" + f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" ) if count_highlights >= max_to_download: logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") @@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, h, "highlight") except Exception as e: - result.append("errors", f"Error downloading highlight {h.get('id')}") - logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading highlight {h.get('id')}") + logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") return h_info @@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, p, "post") except Exception as e: - result.append("errors", f"Error downloading post {p.get('id')}") - logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading post {p.get('id')}") + logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) post_count += 1 if post_count >= max_to_download: @@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, p, "tagged") except Exception as e: - result.append("errors", f"Error downloading tagged post {p.get('id')}") - logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading tagged post {p.get('id')}") + logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) tagged_count += 1 if tagged_count >= max_to_download: diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index d559c47..af525f3 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -8,7 +8,7 @@ import re import os import shutil import instaloader -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata @@ -29,8 +29,9 @@ class InstagramExtractor(Extractor): # TODO: links to stories def setup(self) -> None: - logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.") - logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.") + logger.warning( + "Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead." + ) self.insta = instaloader.Instaloader( download_geotags=True, @@ -43,12 +44,11 @@ class InstagramExtractor(Extractor): self.insta.load_session_from_file(self.username, self.session_file) except Exception: try: - logger.debug("Session file failed", exc_info=True) - logger.info("No valid session file found - Attempting login with use and password.") + logger.info("no valid session file found - Attempting login with use and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) except Exception as e: - logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") + logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}") def download(self, item: Metadata) -> Metadata: url = item.get_url() @@ -72,14 +72,14 @@ class InstagramExtractor(Extractor): result = self.download_profile(url, profile_matches[0]) except Exception as e: logger.error( - f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." + f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." ) finally: shutil.rmtree(self.download_folder, ignore_errors=True) return result def download_post(self, url: str, post_id: str) -> Metadata: - logger.debug(f"Instagram {post_id=} detected in {url=}") + logger.debug(f"Instagram {post_id=} detected") post = instaloader.Post.from_shortcode(self.insta.context, post_id) if self.insta.download_post(post, target=post.owner_username): @@ -87,7 +87,7 @@ class InstagramExtractor(Extractor): def download_profile(self, url: str, username: str) -> Metadata: # gets posts, posts where username is tagged, igtv postss, stories, and highlights - logger.debug(f"Instagram {username=} detected in {url=}") + logger.debug(f"Instagram {username=} detected") profile = instaloader.Profile.from_username(self.insta.context, username) try: @@ -95,27 +95,27 @@ class InstagramExtractor(Extractor): try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download post: {post.shortcode}: {e}") + logger.error(f"failed to download post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_posts: {e}") + logger.error(f"failed profile.get_posts: {e}") try: for post in profile.get_tagged_posts(): try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") + logger.error(f"failed to download tagged post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_tagged_posts: {e}") + logger.error(f"failed profile.get_tagged_posts: {e}") try: for post in profile.get_igtv_posts(): try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") + logger.error(f"failed to download igtv post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_igtv_posts: {e}") + logger.error(f"failed profile.get_igtv_posts: {e}") try: for story in self.insta.get_stories([profile.userid]): @@ -123,9 +123,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") except Exception as e: - logger.error(f"Failed to download story item: {item}: {e}") + logger.error(f"failed to download story item: {item}: {e}") except Exception as e: - logger.error(f"Failed get_stories: {e}") + logger.error(f"failed get_stories: {e}") try: for highlight in self.insta.get_highlights(profile.userid): @@ -133,9 +133,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") except Exception as e: - logger.error(f"Failed to download highlight item: {item}: {e}") + logger.error(f"failed to download highlight item: {item}: {e}") except Exception as e: - logger.error(f"Failed get_highlights: {e}") + logger.error(f"failed get_highlights: {e}") return self.process_downloads(url, f"@{username}", profile._asdict(), None) @@ -158,4 +158,4 @@ class InstagramExtractor(Extractor): return result.success("instagram") except Exception as e: - logger.error(f"Could not fetch instagram post {url} due to: {e}") + logger.error(f"could not fetch instagram post due to: {e}") diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index b4f9378..9d1fd7e 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -12,7 +12,7 @@ import shutil import time from sqlite3 import OperationalError -from loguru import logger +from auto_archiver.utils.custom_logger import logger from telethon.sync import TelegramClient from auto_archiver.core import Extractor diff --git a/src/auto_archiver/modules/json_enricher/json_enricher.py b/src/auto_archiver/modules/json_enricher/json_enricher.py index b0900b6..7a5c41e 100644 --- a/src/auto_archiver/modules/json_enricher/json_enricher.py +++ b/src/auto_archiver/modules/json_enricher/json_enricher.py @@ -1,5 +1,5 @@ import json -from loguru import logger +from auto_archiver.utils.custom_logger import logger import os from auto_archiver.core import Enricher diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index fdc6978..79cb1e8 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -1,7 +1,7 @@ import shutil from typing import IO import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index 9356b16..74f4b9b 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -1,6 +1,6 @@ import datetime import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index e4fac44..b59ce62 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -1,6 +1,6 @@ import subprocess import traceback -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index c920a03..272b112 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,6 +1,6 @@ import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger import opentimestamps from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 19b9c59..bad408f 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -15,7 +15,7 @@ import traceback import pdqhash import numpy as np from PIL import Image, UnidentifiedImageError -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index b5d905d..602cbe4 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -2,7 +2,7 @@ from typing import IO import boto3 import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 3ab1389..f6f7b01 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -2,7 +2,7 @@ import ssl import os from slugify import slugify from urllib.parse import urlparse -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index e70198d..f32fb1e 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests import re import html from bs4 import BeautifulSoup -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 2643b32..2dcc90e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import ( ) from tqdm import tqdm -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index a8f844f..4e15adf 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. import ffmpeg import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Media, Metadata diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 1626b71..1c95f24 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -5,7 +5,7 @@ import hashlib from slugify import slugify import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder) from rfc3161_client import VerificationError as Rfc3161VerificationError diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 5a0023a..71ea318 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -4,7 +4,7 @@ import re import mimetypes import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from pytwitter import Api from slugify import slugify diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index b1fbd80..4e21cf7 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -4,7 +4,7 @@ import os import shutil import subprocess from zipfile import ZipFile -from loguru import logger +from auto_archiver.utils.custom_logger import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py index f06effd..2cb1815 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py @@ -1,5 +1,5 @@ import json -from loguru import logger +from auto_archiver.utils.custom_logger import logger import time import requests diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 063bd26..043fc30 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -1,7 +1,7 @@ import traceback import requests import time -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/utils/custom_logger.py b/src/auto_archiver/utils/custom_logger.py new file mode 100644 index 0000000..9c04f35 --- /dev/null +++ b/src/auto_archiver/utils/custom_logger.py @@ -0,0 +1,37 @@ +from loguru import logger +import json + + +def extract_log_data(record): + subset = { + "level": record["level"].name, + "time": record["time"].isoformat(timespec="seconds"), + } + subset["loc"] = f"{record['file'].name}:{record['function']}:{record['line']}" + + for extra_key in ["trace", "url", "worksheet", "row"]: + if extra_val := record.get("extra", {}).get(extra_key): + subset[extra_key] = extra_val + + subset["message"] = record["message"] + if exception := record.get("exception"): + subset["exception"] = exception + return subset + + +def serialize_no_message(record): + subset = extract_log_data(record) + subset.pop("message", None) + return json.dumps(subset, ensure_ascii=False) + + +def serialize(record): + return json.dumps(extract_log_data(record), ensure_ascii=False) + + +def patching(record): + record["extra"]["serialized"] = serialize(record) + record["extra"]["serialize_no_message"] = serialize_no_message(record) + + +logger = logger.patch(patching) diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 27a1bc9d..4c872f3 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -7,7 +7,7 @@ from datetime import datetime, timezone from dateutil.parser import parse as parse_dt import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger def mkdir_if_not_exists(folder): diff --git a/tests/conftest.py b/tests/conftest.py index a54f01d..6f47a46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib -from loguru import logger +from auto_archiver.utils.custom_logger import logger import pytest from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.module import ModuleFactory diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py index ade26e4..6a54b40 100644 --- a/tests/data/test_modules/example_extractor/example_extractor.py +++ b/tests/data/test_modules/example_extractor/example_extractor.py @@ -1,6 +1,6 @@ from auto_archiver.core import Extractor -from loguru import logger +from auto_archiver.utils.custom_logger import logger class ExampleExtractor(Extractor): diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py index 898df96..655afec 100644 --- a/tests/data/test_modules/example_module/example_module.py +++ b/tests/data/test_modules/example_module/example_module.py @@ -1,6 +1,6 @@ from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata -from loguru import logger +from auto_archiver.utils.custom_logger import logger class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): diff --git a/tests/test_implementation.py b/tests/test_implementation.py index e52a8d8..69dd5e6 100644 --- a/tests/test_implementation.py +++ b/tests/test_implementation.py @@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path): def autoarchiver(tmp_path, monkeypatch, request): def _autoarchiver(args=[]): def cleanup(): - from loguru import logger + from auto_archiver.utils.custom_logger import logger if not logger._core.handlers.get(0): logger._core.handlers_count = 0 From ad29cb44470007db9dae3a930c4fd1423312acd1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 26 Jun 2025 15:48:10 +0100 Subject: [PATCH 09/17] adds post_data to metadata for instagram --- .../instagram_api_extractor/instagram_api_extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index e21b089..44562f4 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -170,13 +170,15 @@ class InstagramAPIExtractor(Extractor): result.set("#highlights", count_highlights) return count_highlights - def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata: + def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = "") -> Metadata: if id: post = self.call_api("v1/media/by/id", {"id": id}) else: post = self.call_api("v1/media/by/code", {"code": code}) assert post, f"Post {id or code} not found" + result.set(f"{context}_data", post) + if caption_text := post.get("caption_text"): result.set_title(caption_text) From afd9090a4c683f9e9711847d65adf5c8f626be98 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 26 Jun 2025 17:20:04 +0100 Subject: [PATCH 10/17] concludes logging standardization refactor --- scripts/telegram_setup.py | 2 +- src/auto_archiver/core/extractor.py | 2 +- src/auto_archiver/core/media.py | 4 +- src/auto_archiver/core/orchestrator.py | 16 +++---- .../antibot_extractor_enricher.py | 20 ++++---- .../antibot_extractor_enricher/dropin.py | 4 +- .../dropins/linkedin.py | 2 +- .../dropins/reddit.py | 6 +-- .../antibot_extractor_enricher/dropins/vk.py | 4 +- src/auto_archiver/modules/api_db/api_db.py | 4 +- .../atlos_feeder_db_storage.py | 12 ++--- .../modules/csv_feeder/csv_feeder.py | 6 +-- .../modules/gdrive_storage/gdrive_storage.py | 26 +++++----- .../modules/generic_extractor/bluesky.py | 2 +- .../generic_extractor/generic_extractor.py | 48 +++++++++---------- .../modules/generic_extractor/tiktok.py | 4 +- .../modules/generic_extractor/twitter.py | 4 +- .../gsheet_feeder_db/gsheet_feeder_db.py | 14 +++--- .../modules/hash_enricher/hash_enricher.py | 2 +- .../modules/html_formatter/html_formatter.py | 2 +- .../instagram_api_extractor.py | 38 +++++++-------- .../instagram_extractor.py | 29 +++++------ .../instagram_tbot_extractor.py | 12 ++--- .../modules/json_enricher/json_enricher.py | 4 +- .../modules/local_storage/local_storage.py | 3 +- .../modules/meta_enricher/meta_enricher.py | 11 ++--- .../metadata_enricher/metadata_enricher.py | 7 ++- .../opentimestamps_enricher.py | 14 +++--- .../pdq_hash_enricher/pdq_hash_enricher.py | 5 +- .../modules/s3_storage/s3_storage.py | 2 +- .../modules/ssl_enricher/ssl_enricher.py | 4 +- .../telegram_extractor/telegram_extractor.py | 2 +- .../telethon_extractor/telethon_extractor.py | 28 +++++------ .../thumbnail_enricher/thumbnail_enricher.py | 8 ++-- .../timestamping_enricher.py | 13 +++-- .../twitter_api_extractor.py | 9 ++-- .../wacz_extractor_enricher.py | 6 +-- .../wayback_extractor_enricher.py | 14 +++--- .../whisper_enricher/whisper_enricher.py | 10 ++-- src/auto_archiver/utils/misc.py | 13 ----- tests/databases/test_api_db.py | 2 +- tests/enrichers/test_meta_enricher.py | 1 - tests/enrichers/test_metadata_enricher.py | 2 +- tests/enrichers/test_pdq_hash_enricher.py | 2 +- tests/enrichers/test_thumbnail_enricher.py | 8 ++-- tests/enrichers/test_wacz_enricher.py | 4 +- tests/test_orchestrator.py | 3 +- tests/utils/test_misc.py | 21 -------- 48 files changed, 207 insertions(+), 252 deletions(-) diff --git a/scripts/telegram_setup.py b/scripts/telegram_setup.py index c11f94a..4c1c3bc 100644 --- a/scripts/telegram_setup.py +++ b/scripts/telegram_setup.py @@ -24,4 +24,4 @@ SESSION_FILE = "secrets/anon-insta" os.makedirs("secrets", exist_ok=True) with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client: - logger.success(f"New session file created: {SESSION_FILE}.session") + logger.success(f"new session file created: {SESSION_FILE}.session") diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 1720c68..bcfa216 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -94,7 +94,7 @@ class Extractor(BaseModule): to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) if verbose: - logger.debug(f"downloading {to_filename=}") + logger.debug(f"Downloading {to_filename=}") headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" } diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index fee81d3..ac6b605 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -86,7 +86,7 @@ class Media: @property # getter .mimetype def mimetype(self) -> str: if not self.filename or len(self.filename) == 0: - logger.warning(f"cannot get mimetype from media without filename: {self}") + logger.warning(f"Cannot get mimetype from media without filename: {self}") return "" if not self._mimetype: self._mimetype = mimetypes.guess_type(self.filename)[0] @@ -116,7 +116,7 @@ class Media: # self.is_video() should be used together with this method try: streams = ffmpeg.probe(self.filename, select_streams="v")["streams"] - logger.debug(f"STREAMS FOR {self.filename} {streams}") + logger.debug(f"Streams for {self.filename}: {streams}") return any(s.get("duration_ts", 0) > 0 for s in streams) except Error: return False # ffmpeg errors when reading bad files diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 27a1bc9e..96f25c6 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -539,11 +539,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ for feeder in self.feeders: for item in feeder: with logger.contextualize(url=item.get_url(), trace=random_str(12)): - logger.info("started processing") + logger.info("Started processing") yield self.feed_item(item) url_count += 1 - logger.info(f"processed {url_count} URL(s)") + logger.info(f"Processed {url_count} URL(s)") self.cleanup() def feed_item(self, item: Metadata) -> Metadata: @@ -561,7 +561,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit - logger.warning("caught interrupt") + logger.warning("Caught interrupt") for d in self.databases: d.aborted(item) self.cleanup() @@ -620,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(cached_result, cached=True) except Exception as e: - logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}") return cached_result # 3 - call extractors until one succeeds for a in self.extractors: - logger.info(f"trying extractor {a.name}") + logger.info(f"Trying extractor {a.name}") try: result.merge(a.download(result)) if result.is_success(): break except Exception as e: - logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}") + logger.error(f"Extractor {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content for e in self.enrichers: try: e.enrich(result) except Exception as exc: - logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}") + logger.error(f"Enricher {e.name}: {exc}: {traceback.format_exc()}") # 5 - store all downloaded/generated media result.store(storages=self.storages) @@ -657,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(result) except Exception as e: - logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}") return result diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index e380adb..8d5d019 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): continue # Skip imported modules/classes/functions if isinstance(obj, type) and issubclass(obj, Dropin): dropins.append(obj) - logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") + logger.debug(f"Loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") return dropins def sanitize_url(self, url: str) -> str: @@ -86,10 +86,10 @@ class AntibotExtractorEnricher(Extractor, Enricher): try: with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb: - logger.info(f"selenium browser is up with agent {self.agent}, opening url...") + logger.info(f"Selenium browser is up with agent {self.agent}, opening url...") sb.uc_open_with_reconnect(url, 4) - logger.debug("handling CAPTCHAs for...") + logger.debug("Handling CAPTCHAs for...") sb.uc_gui_handle_cf() sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future @@ -97,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): dropin.open_page(url) if self.detect_auth_wall and self._hit_auth_wall(sb): - logger.warning("skipping since auth wall or CAPTCHA was detected") + logger.warning("Skipping since auth wall or CAPTCHA was detected") return False sb.wait_for_ready_state_complete() @@ -124,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): js_css_selector=dropin.js_for_video_css_selectors(), max_media=self.max_download_videos - downloaded_videos, ) - logger.info("completed") + logger.info("Completed") return to_enrich except selenium.common.exceptions.SessionNotCreatedException as e: if custom_data_dir: # the retry logic only works once logger.error( - f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." + f"Session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." ) return self.enrich(to_enrich, custom_data_dir=False) raise e # re-raise except Exception as e: - logger.error(f"runtime error: {e}: {traceback.format_exc()}") + logger.error(f"Runtime error: {e}: {traceback.format_exc()}") return False def _get_suitable_dropin(self, url: str, sb: SB): @@ -145,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ for dropin in self.dropins: if dropin.suitable(url): - logger.debug(f"using drop-in {dropin.__name__}") + logger.debug(f"Using drop-in {dropin.__name__}") return dropin(sb, self) return DefaultDropin(sb, self) @@ -240,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) - logger.debug(f"setting window size to {x}x{y} for full page screenshot.") + logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") sb.set_window_size(x, y) screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") @@ -279,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): # js_for_css_selectors for src in sources: if len(all_urls) >= max_media: - logger.debug(f"reached max download limit of {max_media} images/videos.") + logger.debug(f"Reached max download limit of {max_media} images/videos.") break if not is_relevant_url(src): continue diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index c45d7ad..47c958a 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -144,7 +144,7 @@ class Dropin: with yt_dlp.YoutubeDL(validated_options) as ydl: for url in video_urls: try: - logger.debug("downloading video from url") + logger.debug(f"Downloading video from url: {url}") info = ydl.extract_info(url, download=True) filename = ydl_entry_to_filename(ydl, info) if not filename: # Failed to download video. @@ -156,5 +156,5 @@ class Dropin: to_enrich.add_media(media) downloaded += 1 except Exception as e: - logger.error(f"download failed: {e} {traceback.format_exc()}") + logger.error(f"Download failed: {e} {traceback.format_exc()}") return downloaded diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py index 082e409..0527f89 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py @@ -62,7 +62,7 @@ class LinkedinDropin(Dropin): self.sb.wait_for_ready_state_complete() username, password = self._get_username_password("linkedin.com") - logger.debug("logging in to Linkedin with username: {}", username) + logger.debug("Logging in to Linkedin with username: {}", username) self.sb.type("#username", username) self.sb.type("#password", password) self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index 7f5e23e..3f91350 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -50,7 +50,7 @@ class RedditDropin(Dropin): self._close_cookies_banner() username, password = self._get_username_password("reddit.com") - logger.debug("logging in to Reddit with username: {}", username) + logger.debug("Logging in to Reddit with username: {}", username) self.sb.type("#login-username", username) self.sb.type("#login-password", password) @@ -68,7 +68,7 @@ class RedditDropin(Dropin): self.sb.click_link_text("Log in") self.sb.wait_for_ready_state_complete() if self.sb.is_text_visible("Welcome back"): - logger.debug("login successful") + logger.debug("Login successful") self.sb.click_if_visible("this link") def _close_cookies_banner(self): @@ -88,5 +88,5 @@ class RedditDropin(Dropin): .map(el => el.src || el.href) .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); """) - logger.debug("found {} video URLs", len(filtered_urls)) + logger.debug("Found {} video URLs", len(filtered_urls)) return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 02afd75..65b1b24 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -57,12 +57,12 @@ class VkDropin(Dropin): self.sb.open("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): - logger.debug("already logged in to VK.") + logger.debug("Already logged in to VK.") return True # need to login username, password = self._get_username_password("vk.com") - logger.debug("logging in to VK with username: {}", username) + logger.debug("Logging in to VK with username: {}", username) self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index 1475375..235af80 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -36,9 +36,9 @@ class AAApiDb(Database): if not self.store_results: return if cached: - logger.debug("skipping saving archive to AA API because it was cached") + logger.debug("Skipping saving archive to AA API because it was cached") return - logger.debug("saving archive to the AA API.") + logger.debug("Saving archive to the AA API.") payload = { "author_id": self.author_id, diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 814800d..ff384f8 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -72,7 +72,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", json={"metadata": {"processed": True, "status": "error", "error": reason}}, ) - logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}") + logger.info(f"Stored failure ID {atlos_id} on Atlos: {reason}") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check and fetch if the given item has been archived already, each @@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): """Mark an item as successfully archived in Atlos.""" atlos_id = item.metadata.get("atlos_id") if not atlos_id: - logger.info("item has no Atlos ID, skipping") + logger.info("Item has no Atlos ID, skipping") return self._post( f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", @@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): } }, ) - logger.info(f"stored success ID {atlos_id} on Atlos") + logger.info(f"Stored success ID {atlos_id} on Atlos") # ! Atlos Module - Storage Methods @@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: """Upload a media file to Atlos if it has not been uploaded already.""" if metadata is None: - logger.error(f"no metadata provided for {media.filename}") + logger.error(f"No metadata provided for {media.filename}") return False atlos_id = metadata.get("atlos_id") if not atlos_id: - logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.") + logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.") return False media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) @@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): params={"title": media.properties}, files={"file": (os.path.basename(media.filename), file_obj)}, ) - logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") + logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") return True def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index f41f6b4..3d25643 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -20,19 +20,19 @@ class CSVFeeder(Feeder): url_column = first_row.index(url_column) except ValueError: logger.error( - f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" + f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" ) return elif not (url_or_none(first_row[url_column])): # it's a header row, but we've been given a column number already - logger.debug(f"skipping header row: {first_row}") + logger.debug(f"Skipping header row: {first_row}") else: # first row isn't a header row, rewind the file f.seek(0) for row in reader: if not url_or_none(row[url_column]): - logger.warning(f"not a valid URL in row: {row}, skipping") + logger.warning(f"Not a valid URL in row: {row}, skipping") continue url = row[url_column] yield Metadata().set_url(url) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 6a15e80..980be7a 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -23,10 +23,10 @@ class GDriveStorage(Storage): def _setup_google_drive_service(self): """Initialize Google Drive service based on provided credentials.""" if self.oauth_token: - logger.debug(f"using Google Drive OAuth token: {self.oauth_token}") + logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}") self.service = self._initialize_with_oauth_token() elif self.service_account: - logger.debug(f"using Google Drive service account: {self.service_account}") + logger.debug(f"Using Google Drive service account: {self.service_account}") self.service = self._initialize_with_service_account() else: raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") @@ -41,7 +41,7 @@ class GDriveStorage(Storage): if not creds.valid and creds.expired and creds.refresh_token: creds.refresh(Request()) with open(self.oauth_token, "w") as token_file: - logger.debug("saving refreshed OAuth token.") + logger.debug("Saving refreshed OAuth token.") token_file.write(creds.to_json()) elif not creds.valid: raise ValueError("Invalid OAuth token. Please regenerate the token.") @@ -62,7 +62,7 @@ class GDriveStorage(Storage): parent_id, folder_id = self.root_folder_id, None path_parts = media.key.split(os.path.sep) filename = path_parts[-1] - logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") + logger.info(f"Looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) parent_id = folder_id @@ -70,7 +70,7 @@ class GDriveStorage(Storage): file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True) if not file_id: # - logger.info(f"file {filename} not found in folder {folder_id}") + logger.info(f"File {filename} not found in folder {folder_id}") return None return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" @@ -83,7 +83,7 @@ class GDriveStorage(Storage): parent_id, upload_to = self.root_folder_id, None path_parts = media.key.split(os.path.sep) filename = path_parts[-1] - logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") + logger.info(f"Checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") for folder in path_parts[0:-1]: upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) if upload_to is None: @@ -91,7 +91,7 @@ class GDriveStorage(Storage): parent_id = upload_to # upload file to gd - logger.debug(f"uploading {filename=} to folder id {upload_to}") + logger.debug(f"Uploading {filename=} to folder id {upload_to}") file_metadata = {"name": [filename], "parents": [upload_to]} try: media = MediaFileUpload(media.filename, resumable=True) @@ -100,11 +100,11 @@ class GDriveStorage(Storage): .create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id") .execute() ) - logger.debug(f"uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}") + logger.debug(f"Uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}") except FileNotFoundError as e: - logger.error(f"gd uploadf: file not found {media.filename=} - {e}") + logger.error(f"GD uploadf: file not found {media.filename=} - {e}") except Exception as e: - logger.error(f"gd uploadf: error uploading {media.filename=} to {upload_to} - {e}") + logger.error(f"GD uploadf: error uploading {media.filename=} to {upload_to} - {e}") # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: @@ -133,7 +133,7 @@ class GDriveStorage(Storage): self.api_cache = getattr(self, "api_cache", {}) cache_key = f"{parent_id}_{name}_{use_mime_type}" if cache_key in self.api_cache: - logger.debug(f"cache hit for {cache_key=}") + logger.debug(f"Cache hit for {cache_key=}") return self.api_cache[cache_key] # API logic @@ -168,7 +168,7 @@ class GDriveStorage(Storage): else: logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.") if attempt < retries - 1: - logger.debug(f"sleeping for {sleep_seconds} second(s)") + logger.debug(f"Sleeping for {sleep_seconds} second(s)") time.sleep(sleep_seconds) if raise_on_missing: @@ -180,7 +180,7 @@ class GDriveStorage(Storage): Creates a new GDrive folder @name inside folder @parent_id Returns id of the created folder """ - logger.debug(f"creating new folder with {name=} inside {parent_id=}") + logger.debug(f"Creating new folder with {name=} inside {parent_id=}") file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]} gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute() return gd_folder.get("id") diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 261ff03..a4357ca 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -18,7 +18,7 @@ class Bluesky(GenericDropin): # download if embeds present (1 video XOR >=1 images) for media in self._download_bsky_embeds(post, archiver): result.add_media(media) - logger.debug(f"downloaded {len(result.media)} media files") + logger.debug(f"Downloaded {len(result.media)} media files") return result diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index f71ac28..e0d3f04 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -67,7 +67,7 @@ class GenericExtractor(Extractor): os.execv(sys.executable, [sys.executable] + sys.argv) def update_package(self, package_name: str) -> bool: - logger.info(f"checking and updating {package_name}...") + logger.info(f"Checking and updating {package_name}...") from importlib.metadata import version as get_version old_version = get_version(package_name) @@ -79,7 +79,7 @@ class GenericExtractor(Extractor): return True logger.info(f"{package_name} already up to date") except Exception as e: - logger.error(f"failed to update {package_name}: {e}") + logger.error(f"Failed to update {package_name}: {e}") return False def setup_po_tokens(self) -> None: @@ -110,7 +110,7 @@ class GenericExtractor(Extractor): missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None] if missing_tools: logger.error( - f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " + f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " "Install these tools or run bgutils via Docker. " "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider" ) @@ -139,7 +139,7 @@ class GenericExtractor(Extractor): f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip" ) zip_path = os.path.join(base_dir, f"{plugin_version}.zip") - logger.info(f"downloading bgutils release zip for version {plugin_version}...") + logger.info(f"Downloading bgutils release zip for version {plugin_version}...") urlretrieve(zip_url, zip_path) with zipfile.ZipFile(zip_path, "r") as z: z.extractall(base_dir) @@ -148,7 +148,7 @@ class GenericExtractor(Extractor): extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}") shutil.move(os.path.join(extracted_root, "server"), server_dir) shutil.rmtree(extracted_root) - logger.info("installing dependencies and transpiling PoT Generator script...") + logger.info("Installing dependencies and transpiling PoT Generator script...") subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True) subprocess.run(["npx", "tsc"], cwd=server_dir, check=True) @@ -164,7 +164,7 @@ class GenericExtractor(Extractor): logger.info(f"PO Token script configured at: {script_path}") except Exception as e: - logger.error(f"failed to set up PO Token script: {e}") + logger.error(f"Failed to set up PO Token script: {e}") def suitable_extractors(self, url: str) -> Generator[str, None, None]: """ @@ -205,7 +205,7 @@ class GenericExtractor(Extractor): media = Media(cover_image_path) metadata.add_media(media, id="cover") except Exception as e: - logger.error(f"could not download cover image {thumbnail_url}: {e}") + logger.error(f"Could not download cover image {thumbnail_url}: {e}") dropin = self.dropin_for_name(info_extractor.ie_key()) if dropin: @@ -352,7 +352,7 @@ class GenericExtractor(Extractor): if not dropin: # TODO: add a proper link to 'how to create your own dropin' - logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}. + logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") return False @@ -374,7 +374,7 @@ class GenericExtractor(Extractor): if "entries" in data: entries = data.get("entries", []) if not len(entries): - logger.info("YoutubeDLArchiver could not find any video") + logger.info("GenericExtractor could not find any video") return False else: entries = [data] @@ -388,7 +388,7 @@ class GenericExtractor(Extractor): # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. continue - logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}") + logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}") new_media = Media(filename) for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: @@ -403,12 +403,12 @@ class GenericExtractor(Extractor): text = " ".join([line.text for line in subs]) new_media.set(f"subtitles_{lang}", text) except Exception as e: - logger.error(f"error loading subtitle file {val.get('filepath')}: {e}") + logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}") result.add_media(new_media) except Exception as e: - logger.error(f"error processing entry {entry}: {e}") + logger.error(f"Error processing entry {entry}: {e}") if not len(result.media): - logger.info(f"no media found for entry {entry}, skipping.") + logger.info(f"No media found for entry {entry}, skipping.") return False return self.add_metadata(data, info_extractor, url, result) @@ -470,14 +470,14 @@ class GenericExtractor(Extractor): def _helper_for_successful_extract_info(data, info_extractor, url, ydl): if data.get("is_live", False) and not self.livestreams: - logger.warning("livestream detected, skipping due to 'livestreams' configuration setting") + logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting") return False # it's a valid video, that the youtubdedl can download out of the box return self.get_metadata_for_video(data, info_extractor, url, ydl) try: if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor): - logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}") + logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}") raise SkipYtdlp() # don't download since it can be a live stream @@ -496,17 +496,17 @@ class GenericExtractor(Extractor): if not isinstance(e, SkipYtdlp): logger.debug( - f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' + f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' ) try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: - logger.error("error downloading metadata for post: {error}", error=str(post_e)) + logger.error("Error downloading metadata for post: {error}", error=str(post_e)) return False except Exception as generic_e: logger.debug( - 'attempt to extract using ytdlp extractor "{name}" failed: \n {error}', + 'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}', name=info_extractor.IE_NAME, error=str(generic_e), exc_info=True, @@ -559,17 +559,17 @@ class GenericExtractor(Extractor): # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if "username" in auth and "password" in auth: - logger.debug("using provided auth username and password") + logger.debug("Using provided auth username and password") ydl_options.extend(("--username", auth["username"])) ydl_options.extend(("--password", auth["password"])) elif "cookie" in auth: - logger.debug("using provided auth cookie") + logger.debug("Using provided auth cookie") yt_dlp.utils.std_headers["cookie"] = auth["cookie"] elif "cookies_from_browser" in auth: - logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}") + logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']}") ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) elif "cookies_file" in auth: - logger.debug(f"using cookies from file {auth['cookies_file']}") + logger.debug(f"Using cookies from file {auth['cookies_file']}") ydl_options.extend(("--cookies", auth["cookies_file"])) # Applying user-defined extractor_args @@ -579,11 +579,11 @@ class GenericExtractor(Extractor): arg_str = ";".join(f"{k}={v}" for k, v in args.items()) else: arg_str = str(args) - logger.debug(f"setting extractor_args: {key}:{arg_str}") + logger.debug(f"Setting extractor_args: {key}:{arg_str}") ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"]) if self.ytdlp_args: - logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}") + logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}") ydl_options += self.ytdlp_args.split(" ") *_, validated_options = yt_dlp.parse_options(ydl_options) diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py index 66936e3..36e8f74 100644 --- a/src/auto_archiver/modules/generic_extractor/tiktok.py +++ b/src/auto_archiver/modules/generic_extractor/tiktok.py @@ -22,7 +22,7 @@ class Tiktok(GenericDropin): return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) def extract_post(self, url: str, ie_instance): - logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}") + logger.debug("Using Tikwm API to attempt to download tiktok video") endpoint = self.TIKWM_ENDPOINT.format(url=url) @@ -62,7 +62,7 @@ class Tiktok(GenericDropin): # get the video or fail video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}") if not video_downloaded: - logger.error(f"failed to download video from {video_url}") + logger.error("Failed to download video") return False video_media = Media(video_downloaded) if duration := post.get("duration", None): diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index c5964ad..5153f1c 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -40,7 +40,7 @@ class Twitter(GenericDropin): raise ValueError("Error retreiving post. Are you sure it exists?") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: - logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") + logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False full_text = tweet.pop("full_text", "") @@ -49,7 +49,7 @@ class Twitter(GenericDropin): result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) if not tweet.get("entities", {}).get("media"): - logger.debug("no media found, archiving tweet text only") + logger.debug("No media found, archiving tweet text only") result.status = "twitter-ytdl" return result for i, tw_media in enumerate(tweet["entities"]["media"]): diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index 0f03de7..ddc02df 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -42,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database): sh = self.open_sheet() for ii, worksheet in enumerate(sh.worksheets()): if not self.should_process_sheet(worksheet.title): - logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules") + logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules") continue - logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}") + logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}") gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.debug( - f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" + f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" ) continue with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"): # process and yield metadata here: yield from self._process_rows(gw) - logger.info(f"finished worksheet {worksheet.title}") + logger.info(f"Finished worksheet {worksheet.title}") def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): @@ -133,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database): if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: - logger.error(f"unable to batch {col}={final_value} due to {e}") + logger.error(f"Unable to batch {col}={final_value} due to {e}") status_message = item.status if cached: @@ -193,13 +193,13 @@ class GsheetsFeederDB(Feeder, Database): gw, row = self._retrieve_gsheet(item) gw.set_cell(row, "status", new_status) except Exception as e: - logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}") + logger.debug(f"Unable to update sheet: {e}: {traceback.format_exc()}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: - logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.") + logger.error("Unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.") return gw, row diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 799c5b3..526bf9a 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -22,7 +22,7 @@ class HashEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - logger.debug(f"calculating media hashes with algo={self.algorithm}") + logger.debug(f"Calculating media hashes with algo={self.algorithm}") for i, m in enumerate(to_enrich.media): if len(hd := self.calculate_hash(m.filename)): diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 41188f1..fee2d7c 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -35,7 +35,7 @@ class HtmlFormatter(Formatter): def format(self, item: Metadata) -> Media: url = item.get_url() if item.is_empty(): - logger.debug("nothing to format, skipping") + logger.debug("Nothing to format, skipping") return content = self.template.render( diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 44562f4..89337db 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -49,7 +49,7 @@ class InstagramAPIExtractor(Extractor): if not len(insta_matches) or len(insta_matches[0]) != 3: return if len(insta_matches) > 1: - logger.debug("multiple instagram matches found, using the first one") + logger.debug("Multiple instagram matches found, using the first one") return g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] if g1 == "": @@ -65,13 +65,13 @@ class InstagramAPIExtractor(Extractor): return self.download_post(item, id=g3, context="story") return self.download_stories(item, g2) else: - logger.warning(f"unknown instagram regex group match {g1=}") + logger.warning(f"Unknown instagram regex group match {g1=}") return @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) def call_api(self, path: str, params: dict) -> dict: headers = {"accept": "application/json", "x-access-key": self.access_token} - logger.debug(f"calling {self.api_endpoint}/{path} with {params=}") + logger.debug(f"Calling {self.api_endpoint}/{path} with {params=}") return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json() def cleanup_dict(self, d: dict | list) -> dict: @@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor): count_posts += len(stories) result.set("#stories", len(stories)) except Exception as e: - result.append("errors", f"error downloading stories for {username}") - logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading stories for {username}") + logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}") # download all posts try: @@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"error downloading posts for {username}") - logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading posts for {username}") + logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}") # download all tagged try: @@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"error downloading tagged posts for {username}") - logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading tagged posts for {username}") + logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}") # download all highlights try: @@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor): except Exception as e: result.append( "errors", - f"error downloading highlight id{h.get('pk')} for {username}", + f"Error downloading highlight id{h.get('pk')} for {username}", ) logger.error( - f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" + f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" ) if count_highlights >= max_to_download: logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") @@ -210,8 +210,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, h, "highlight") except Exception as e: - result.append("errors", f"error downloading highlight {h.get('id')}") - logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading highlight {h.get('id')}") + logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") return h_info @@ -248,13 +248,13 @@ class InstagramAPIExtractor(Extractor): break posts, end_cursor = posts[0], posts[1] posts = posts[: min(max_to_download, len(posts))] - logger.info(f"parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}") + logger.info(f"Parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}") for p in posts: try: self.scrape_item(result, p, "post") except Exception as e: - result.append("errors", f"error downloading post {p.get('id')}") - logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading post {p.get('id')}") + logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) post_count += 1 if post_count >= max_to_download: @@ -275,14 +275,14 @@ class InstagramAPIExtractor(Extractor): break next_page_id = resp.get("next_page_id") - logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}") + logger.info(f"Parsing {len(posts)} tagged posts, next {next_page_id=}") posts = posts[: min(max_to_download, len(posts))] for p in posts: try: self.scrape_item(result, p, "tagged") except Exception as e: - result.append("errors", f"error downloading tagged post {p.get('id')}") - logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"Error downloading tagged post {p.get('id')}") + logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) tagged_count += 1 if tagged_count >= max_to_download: diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index af525f3..d6f9807 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -7,6 +7,7 @@ highlights, and tagged posts. Authentication is required via username/password o import re import os import shutil +import traceback import instaloader from auto_archiver.utils.custom_logger import logger @@ -44,11 +45,11 @@ class InstagramExtractor(Extractor): self.insta.load_session_from_file(self.username, self.session_file) except Exception: try: - logger.info("no valid session file found - Attempting login with use and password.") + logger.info("No valid session file found - Attempting login with username and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) except Exception as e: - logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}") + logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") def download(self, item: Metadata) -> Metadata: url = item.get_url() @@ -72,7 +73,7 @@ class InstagramExtractor(Extractor): result = self.download_profile(url, profile_matches[0]) except Exception as e: logger.error( - f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." + f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." ) finally: shutil.rmtree(self.download_folder, ignore_errors=True) @@ -95,27 +96,27 @@ class InstagramExtractor(Extractor): try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") except Exception as e: - logger.error(f"failed to download post: {post.shortcode}: {e}") + logger.error(f"Failed to download post: {post.shortcode}: {e} {traceback.format_exc()}") except Exception as e: - logger.error(f"failed profile.get_posts: {e}") + logger.error(f"Failed profile.get_posts: {e}: {traceback.format_exc()}") try: for post in profile.get_tagged_posts(): try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") except Exception as e: - logger.error(f"failed to download tagged post: {post.shortcode}: {e}") + logger.error(f"Failed to download tagged post: {post.shortcode}: {e} {traceback.format_exc()}") except Exception as e: - logger.error(f"failed profile.get_tagged_posts: {e}") + logger.error(f"Failed profile.get_tagged_posts: {e} {traceback.format_exc()}") try: for post in profile.get_igtv_posts(): try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") except Exception as e: - logger.error(f"failed to download igtv post: {post.shortcode}: {e}") + logger.error(f"Failed to download igtv post: {post.shortcode}: {e} {traceback.format_exc()}") except Exception as e: - logger.error(f"failed profile.get_igtv_posts: {e}") + logger.error(f"Failed profile.get_igtv_posts: {e} {traceback.format_exc()}") try: for story in self.insta.get_stories([profile.userid]): @@ -123,9 +124,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") except Exception as e: - logger.error(f"failed to download story item: {item}: {e}") + logger.error(f"Failed to download story item: {item}: {e} {traceback.format_exc()}") except Exception as e: - logger.error(f"failed get_stories: {e}") + logger.error(f"Failed get_stories: {e} {traceback.format_exc()}") try: for highlight in self.insta.get_highlights(profile.userid): @@ -133,9 +134,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") except Exception as e: - logger.error(f"failed to download highlight item: {item}: {e}") + logger.error(f"Failed to download highlight item: {item}: {e} {traceback.format_exc()}") except Exception as e: - logger.error(f"failed get_highlights: {e}") + logger.error(f"Failed get_highlights: {e} {traceback.format_exc()}") return self.process_downloads(url, f"@{username}", profile._asdict(), None) @@ -158,4 +159,4 @@ class InstagramExtractor(Extractor): return result.success("instagram") except Exception as e: - logger.error(f"could not fetch instagram post due to: {e}") + logger.error(f"Could not fetch instagram post due to: {e} {traceback.format_exc()}") diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 9d1fd7e..80614da 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -32,7 +32,7 @@ class InstagramTbotExtractor(Extractor): 1. makes a copy of session_file that is removed in cleanup 2. checks if the session file is valid """ - logger.info(f"SETUP {self.name} checking login...") + logger.debug(f"SETUP {self.name} checking login...") self._prepare_session_file() self._initialize_telegram_client() @@ -58,10 +58,10 @@ class InstagramTbotExtractor(Extractor): "If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}" ) with self.client.start(): - logger.info(f"SETUP {self.name} login works.") + logger.debug(f"SETUP {self.name} login works.") def cleanup(self) -> None: - logger.info(f"CLEANUP {self.name}.") + logger.debug(f"CLEANUP {self.name}.") session_file_name = self.session_file + ".session" if os.path.exists(session_file_name): os.remove(session_file_name) @@ -79,17 +79,17 @@ class InstagramTbotExtractor(Extractor): # This may be outdated and replaced by the below message, but keeping until confirmed if "You must enter a URL to a post" in message: - logger.debug(f"invalid link {url=} for {self.name}: {message}") + logger.debug(f"Invalid link for {self.name}: {message}") return False if "Media not found or unavailable" in message: - logger.debug(f"No media found for link {url=} for {self.name}: {message}") + logger.debug(f"No media found for {self.name}: {message}") return False if message: result.set_content(message).set_title(message[:128]) elif result.is_empty(): - logger.debug(f"No media found for link {url=} for {self.name}: {message}") + logger.debug(f"No media found for {self.name}: {message}") return False return result.success("insta-via-bot") diff --git a/src/auto_archiver/modules/json_enricher/json_enricher.py b/src/auto_archiver/modules/json_enricher/json_enricher.py index 7a5c41e..71e52e6 100644 --- a/src/auto_archiver/modules/json_enricher/json_enricher.py +++ b/src/auto_archiver/modules/json_enricher/json_enricher.py @@ -8,9 +8,7 @@ from auto_archiver.core import Media, Metadata class JsonEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - - logger.debug(f"JSON Enricher for {url=}") + logger.debug("Enriching as JSON") item_path = os.path.join(self.tmp_dir, "metadata.json") with open(item_path, mode="w", encoding="utf-8") as outf: diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 79cb1e8..5982985 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -38,8 +38,7 @@ class LocalStorage(Storage): os.makedirs(os.path.dirname(dest), exist_ok=True) logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}") - res = shutil.copy2(media.filename, dest) - logger.info(res) + shutil.copy2(media.filename, dest) return True # must be implemented even if unused diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index 74f4b9b..fd50291 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -12,20 +12,17 @@ class MetaEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() if to_enrich.is_empty(): - logger.debug(f"[SKIP] META_ENRICHER there is no media or metadata to enrich: {url=}") + logger.debug("[SKIP] META_ENRICHER there is no media or metadata to enrich") return - logger.debug(f"calculating archive metadata information for {url=}") + logger.debug("Calculating archive metadata information") self.enrich_file_sizes(to_enrich) self.enrich_archive_duration(to_enrich) def enrich_file_sizes(self, to_enrich: Metadata): - logger.debug( - f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)" - ) + logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files") total_size = 0 for media in to_enrich.get_all_media(): file_stats = os.stat(media.filename) @@ -44,7 +41,7 @@ class MetaEnricher(Enricher): size /= 1024 def enrich_archive_duration(self, to_enrich): - logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ") + logger.debug("Calculating archive duration") archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at") to_enrich.set("archive_duration_seconds", archive_duration.seconds) diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index b59ce62..4ed47f3 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -12,8 +12,7 @@ class MetadataEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"extracting EXIF metadata for {url=}") + logger.debug("Extracting EXIF metadata") for i, m in enumerate(to_enrich.media): if len(md := self.get_metadata(m.filename)): @@ -31,8 +30,8 @@ class MetadataEnricher(Enricher): field, value = line.strip().split(":", 1) metadata[field.strip()] = value.strip() return metadata - except FileNotFoundError: - logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.") + except FileNotFoundError as e: + logger.error(f"ExifTool not found. Make sure ExifTool is installed and added to PATH. {e}") except Exception as e: logger.error(f"Error occurred: {e}: {traceback.format_exc()}") return {} diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index 272b112..4c12d53 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,4 +1,5 @@ import os +import traceback from auto_archiver.utils.custom_logger import logger import opentimestamps @@ -14,13 +15,12 @@ from auto_archiver.utils.misc import get_current_timestamp class OpentimestampsEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"OpenTimestamps timestamping files for {url=}") + logger.debug("OpenTimestamps timestamping files") # Get the media files to timestamp media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")] if not media_files: - logger.debug(f"No files found to timestamp in {url=}") + logger.debug("No files found to timestamp") return timestamp_files = [] @@ -94,7 +94,7 @@ class OpentimestampsEnricher(Enricher): detached_timestamp.serialize(ctx) f.write(ctx.getbytes()) except Exception as e: - logger.warning(f"Failed to serialize timestamp file: {e}") + logger.warning(f"Failed to serialize timestamp file: {e} {traceback.format_exc()}") continue # Create media for the timestamp file @@ -113,16 +113,16 @@ class OpentimestampsEnricher(Enricher): media.set("opentimestamps", True) except Exception as e: - logger.warning(f"Error while timestamping {media.filename}: {e}") + logger.warning(f"Error while timestamping {media.filename}: {e} {traceback.format_exc()}") # Add timestamp files to the metadata if timestamp_files: to_enrich.set("opentimestamped", True) to_enrich.set("opentimestamps_count", len(timestamp_files)) - logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}") + logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created") else: to_enrich.set("opentimestamped", False) - logger.warning(f"No successful timestamps created for {url=}") + logger.warning("No successful timestamps created") def verify_timestamp(self, detached_timestamp): """ diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index bad408f..6b69cd0 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -28,8 +28,7 @@ class PdqHashEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"calculating perceptual hashes for {url=}") + logger.debug("Calculating perceptual hashes") media_with_hashes = [] for m in to_enrich.media: @@ -44,7 +43,7 @@ class PdqHashEnricher(Enricher): media.set("pdq_hash", hd) media_with_hashes.append(media.filename) - logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}") + logger.debug(f"Calculated '{len(media_with_hashes)}' perceptual hashes: {media_with_hashes}") def calculate_pdq_hash(self, filename): # returns a hexadecimal string with the perceptual hash for the given filename diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index 602cbe4..9318bb7 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -56,7 +56,7 @@ class S3Storage(Storage): if existing_key := self.file_in_folder(path): media._key = existing_key media.set("previously archived", True) - logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") + logger.debug(f"Skipping upload of {media.filename} because it already exists in {media.key}") return False _, ext = os.path.splitext(media.key) diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index f6f7b01..feb0de1 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -19,10 +19,10 @@ class SSLEnricher(Enricher): url = to_enrich.get_url() parsed = urlparse(url) - assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}" + assert parsed.scheme in ["https"], "Invalid URL scheme" domain = parsed.netloc - logger.debug(f"fetching SSL certificate for {domain=} in {url=}") + logger.debug(f"Fetching SSL certificate for {domain=}") cert = ssl.get_server_certificate((domain, 443)) cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem") diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index f32fb1e..03e4be5 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -38,7 +38,7 @@ class TelegramExtractor(Extractor): video = s.find("video") if video is None: - logger.warning("could not find video") + logger.warning("Could not find video") image_tags = s.find_all(class_="tgme_widget_message_photo_wrap") image_urls = [] diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 2dcc90e..aa3afb7 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -65,7 +65,7 @@ class TelethonExtractor(Extractor): # get currently joined channels # https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel] - logger.info(f"already part of {len(joined_channel_ids)} channels") + logger.info(f"Already part of {len(joined_channel_ids)} channels") i = 0 pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites)) @@ -80,22 +80,22 @@ class TelethonExtractor(Extractor): else: ent = self.client.get_entity(invite) # fails if not a member logger.warning( - f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting." + f"Please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting." ) except ValueError: - logger.info(f"joining new channel {invite=}") + logger.info(f"Joining new channel {invite=}") try: self.client(ImportChatInviteRequest(match.group(2))) except UserAlreadyParticipantError: - logger.info(f"already joined {invite=}") + logger.info(f"Already joined {invite=}") except InviteRequestSentError: - logger.warning(f"already sent a join request with {invite} still no answer") + logger.warning(f"Already sent a join request with {invite} still no answer") except InviteHashExpiredError: logger.warning(f"{invite=} has expired please find a more recent one") except Exception as e: - logger.error(f"could not join channel with {invite=} due to {e}") + logger.error(f"Could not join channel with {invite=} due to {e}") except FloodWaitError as e: - logger.warning(f"got a flood error, need to wait {e.seconds} seconds") + logger.warning(f"Got a flood error, need to wait {e.seconds} seconds") time.sleep(e.seconds) continue else: @@ -117,7 +117,7 @@ class TelethonExtractor(Extractor): url = item.get_url() # detect URLs that we definitely cannot handle match = self.valid_url.search(url) - logger.debug(f"TELETHON: {match=}") + logger.debug(f"Found telethon url {match=}") if not match: return False @@ -135,10 +135,10 @@ class TelethonExtractor(Extractor): try: stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id])) if not stories.stories: - logger.info(f"No stories found for {url}, possibly it's private or the story has expired.") + logger.info("No stories found, possibly it's private or the story has expired.") return False story = stories.stories[0] - logger.debug(f"TELETHON got story {story.id=} {story.date=} {story.expire_date=}") + logger.debug(f"Got story {story.id=} {story.date=} {story.expire_date=}") result.set_timestamp(story.date).set("views", story.views.to_dict()).set( "expire_date", story.expire_date ) @@ -154,20 +154,20 @@ class TelethonExtractor(Extractor): try: post = self.client.get_messages(chat, ids=post_id) except ValueError as e: - logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") + logger.error(f"Could not fetch telegram URL possibly it's private: {e}") return False except ChannelInvalidError as e: logger.error( - f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}" + f"Could not fetch telegram URL. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}" ) return False - logger.debug(f"TELETHON got post {post=}") + logger.debug(f"Got post {post=}") if post is None: return False media_posts = self._get_media_posts_in_group(chat, post) - logger.debug(f"got {len(media_posts)=} for {url=}") + logger.debug(f"Got {len(media_posts)=}") group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 4e15adf..3de7cfa 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -27,12 +27,12 @@ class ThumbnailEnricher(Enricher): Calculates how many thumbnails to generate and at which timestamps based on the video duration, the number of thumbnails per minute and the max number of thumbnails. Thumbnails are equally distributed across the video duration. """ - logger.debug(f"generating thumbnails for {to_enrich.get_url()}") + logger.debug("Generating thumbnails") for m_id, m in enumerate(to_enrich.media[::]): if m.is_video(): folder = os.path.join(self.tmp_dir, random_str(24)) os.makedirs(folder, exist_ok=True) - logger.debug(f"generating thumbnails for {m.filename}") + logger.debug(f"Generating thumbnails for {m.filename}") duration = m.get("duration") try: @@ -42,10 +42,10 @@ class ThumbnailEnricher(Enricher): ) to_enrich.media[m_id].set("duration", duration) except Exception as e: - logger.warning(f"failed to get duration with FFMPEG from {m.filename}: {e}") + logger.warning(f"Failed to get duration with FFMPEG from {m.filename}: {e}") if not duration or type(duration) not in [float, int] or duration <= 0: - logger.warning(f"cannot generate thumbnails for {m.filename} without valid duration") + logger.warning(f"Cannot generate thumbnails for {m.filename} without valid duration") continue num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails)) diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 1c95f24..3d9041c 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -49,8 +49,7 @@ class TimestampingEnricher(Enricher): self.session.close() def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"RFC3161 timestamping existing files for {url=}") + logger.debug(f"RFC3161 timestamping existing files") # create a new text file with the existing media hashes hashes = [ @@ -58,7 +57,7 @@ class TimestampingEnricher(Enricher): ] if not len(hashes): - logger.debug(f"No hashes found in {url=}") + logger.debug(f"No hashes found") return @@ -74,7 +73,7 @@ class TimestampingEnricher(Enricher): try: message = bytes(data_to_sign, encoding='utf8') - logger.debug(f"Timestamping {url=} with {tsa_url=}") + logger.debug(f"Timestamping with {tsa_url=}") signed: TimeStampResponse = self.sign_data(tsa_url, message) # fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities` @@ -92,7 +91,7 @@ class TimestampingEnricher(Enricher): timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url) timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain)) except Exception as e: - logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}") + logger.warning(f"Error while timestamping with {tsa_url=}: {e}") if len(timestamp_tokens): hashes_media.set("timestamp_authority_files", timestamp_tokens) @@ -101,9 +100,9 @@ class TimestampingEnricher(Enricher): hashes_media.set("cryptography v", version("cryptography")) to_enrich.add_media(hashes_media, id="timestamped_hashes") to_enrich.set("timestamped", True) - logger.info(f"{len(timestamp_tokens)} timestamp tokens created for {url=}") + logger.info(f"{len(timestamp_tokens)} timestamp tokens created") else: - logger.warning(f"No successful timestamps for {url=}") + logger.warning(f"No successful timestamps found") def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str: """ diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 71ea318..420008c 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -45,10 +45,9 @@ class TwitterApiExtractor(Extractor): if "https://t.co/" in url: try: r = requests.get(url, timeout=30) - logger.debug(f"Expanded url {url} to {r.url}") url = r.url - except Exception: - logger.error(f"Failed to expand url {url}") + except Exception as e: + logger.error(f"Failed to expand Twitter URL: {e}") return url def download(self, item: Metadata) -> Metadata: @@ -67,7 +66,7 @@ class TwitterApiExtractor(Extractor): return False, False username, tweet_id = matches[0] # only one URL supported - logger.debug(f"Found {username=} and {tweet_id=} in {url=}") + logger.debug(f"Found {username=} and {tweet_id=}") return username, tweet_id @@ -85,7 +84,7 @@ class TwitterApiExtractor(Extractor): media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"], ) - logger.debug(tweet) + logger.debug(f"Got {tweet=}") except Exception as e: logger.error(f"Could not get tweet: {e}") return False diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index 4e21cf7..5ba2112 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -94,7 +94,7 @@ class WaczExtractorEnricher(Enricher, Extractor): # call docker if explicitly enabled or we are running on the host (not in docker) if self.use_docker: - logger.debug(f"generating WACZ in Docker for {url=}") + logger.debug("Generating WACZ in Docker") logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}") if self.docker_commands: cmd = self.docker_commands + cmd @@ -111,12 +111,12 @@ class WaczExtractorEnricher(Enricher, Extractor): if self.profile: profile_file = f"profile-{self.crawl_id}.tar.gz" profile_fn = os.path.join(browsertrix_home_container, profile_file) - logger.debug(f"copying {self.profile} to {profile_fn}") + logger.debug(f"Copying {self.profile} to {profile_fn}") shutil.copyfile(self.profile, profile_fn) cmd.extend(["--profile", os.path.join("/crawls", profile_file)]) else: - logger.debug(f"generating WACZ without Docker for {url=}") + logger.debug("Generating WACZ without Docker") if self.profile: cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py index 2cb1815..581ca88 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py @@ -31,15 +31,15 @@ class WaybackExtractorEnricher(Enricher, Extractor): url = to_enrich.get_url() if UrlUtil.is_auth_wall(url): - logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}") + logger.debug("[SKIP] WAYBACK since url is behind AUTH WALL") return - logger.debug(f"calling wayback for {url=}") - if to_enrich.get("wayback"): logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}") return True + logger.debug("Calling Wayback") + ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"} post_data = {"url": url} if self.if_not_archived_within: @@ -68,7 +68,7 @@ class WaybackExtractorEnricher(Enricher, Extractor): attempt = 1 while not wayback_url and time.time() - start_time <= self.timeout: try: - logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") + logger.debug(f"GETting status for {job_id=} ({attempt=})") r_status = requests.get( f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies ) @@ -79,13 +79,13 @@ class WaybackExtractorEnricher(Enricher, Extractor): logger.error(f"Wayback failed with {r_json}") return False except requests.exceptions.RequestException as e: - logger.warning(f"RequestException: fetching status for {url=} due to: {e}") + logger.warning(f"RequestException: fetching status due to: {e}") break except json.decoder.JSONDecodeError: - logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}") + logger.error(f"Expected a JSON from Wayback and got {r.text}") break except Exception as e: - logger.warning(f"error fetching status for {url=} due to: {e}") + logger.warning(f"error fetching status due to: {e}") if not wayback_url: attempt += 1 time.sleep(1) # TODO: can be improved with exponential backoff diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 043fc30..f317ebd 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -25,7 +25,7 @@ class WhisperEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() - logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") + logger.debug(f"WHISPER[{self.action}]: iterating media items") job_results = {} for i, m in enumerate(to_enrich.media): @@ -35,7 +35,7 @@ class WhisperEnricher(Enricher): try: job_id = self.submit_job(m) job_results[job_id] = False - logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}") + logger.debug(f"Job submitted: {job_id=} for {m.key=}") to_enrich.media[i].set("whisper_model", {"job_id": job_id}) except Exception as e: logger.error( @@ -72,14 +72,14 @@ class WhisperEnricher(Enricher): "type": self.action, # "language": "string" # may be a config } - logger.debug(f"calling API with {payload=}") + logger.debug(f"Calling API with {payload=}") response = requests.post( f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"} ) assert response.status_code == 201, ( f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}" ) - logger.debug(response.json()) + logger.debug(f"Response from whisper API: {response.json()}") return response.json()["id"] def check_jobs(self, job_results: dict): @@ -115,7 +115,7 @@ class WhisperEnricher(Enricher): assert r_res.status_code == 200, ( f"Job artifacts did not respond with 200, instead with: {r_res.status_code}" ) - logger.success(r_res.json()) + logger.info(f"Job {job_id} completed successfully:{r_res.json()}") result = {} for art_id, artifact in enumerate(r_res.json()): subtitle = [] diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 4c872f3..747d57e 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -6,7 +6,6 @@ import uuid from datetime import datetime, timezone from dateutil.parser import parse as parse_dt -import requests from auto_archiver.utils.custom_logger import logger @@ -15,18 +14,6 @@ def mkdir_if_not_exists(folder): os.makedirs(folder) -def expand_url(url): - # expand short URL links - if "https://t.co/" in url: - try: - r = requests.get(url) - logger.debug(f"Expanded url {url} to {r.url}") - return r.url - except Exception: - logger.error(f"Failed to expand url {url}") - return url - - def getattr_or(o: object, prop: str, default=None): try: res = getattr(o, prop) diff --git a/tests/databases/test_api_db.py b/tests/databases/test_api_db.py index 2e87a87..b734053 100644 --- a/tests/databases/test_api_db.py +++ b/tests/databases/test_api_db.py @@ -29,7 +29,7 @@ def test_fetch_fail_status(api_db, metadata, mocker): mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get") mock_get.return_value.status_code = 400 mock_get.return_value.json.return_value = {} - mock_error = mocker.patch("loguru.logger.error") + mock_error = mocker.patch("auto_archiver.utils.custom_logger.logger.error") assert api_db.fetch(metadata) is False mock_error.assert_called_once_with("AA API FAIL (400): {}") diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py index fe0d737..9cc5e48 100644 --- a/tests/enrichers/test_meta_enricher.py +++ b/tests/enrichers/test_meta_enricher.py @@ -33,7 +33,6 @@ def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata): """Test that enrich() does nothing when Metadata is empty.""" mock_metadata.is_empty.return_value = True meta_enricher.enrich(mock_metadata) - mock_metadata.get_url.assert_called_once() def test_enrich_file_sizes(meta_enricher, metadata, tmp_path): diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index 14cfc44..a640920 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -65,7 +65,7 @@ def test_enrich_empty_media(enricher, mocker): def test_get_metadata_error_handling(enricher, mocker): mocker.patch("subprocess.run", side_effect=Exception("Test error")) - mock_log = mocker.patch("loguru.logger.error") + mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.error") result = enricher.get_metadata("test.jpg") assert result == {} assert "Error occurred: " in mock_log.call_args[0][0] diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index 483392d..e128107 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -43,7 +43,7 @@ def test_enrich_skip_non_image(metadata_with_images, mocker): def test_enrich_handles_corrupted_image(metadata_with_images, mocker): mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")) mock_pdq = mocker.patch("pdqhash.compute") - mock_logger = mocker.patch("loguru.logger.error") + mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.error") enricher = PdqHashEnricher() enricher.enrich(metadata_with_images) diff --git a/tests/enrichers/test_thumbnail_enricher.py b/tests/enrichers/test_thumbnail_enricher.py index ba9e988..3f1e375 100644 --- a/tests/enrichers/test_thumbnail_enricher.py +++ b/tests/enrichers/test_thumbnail_enricher.py @@ -75,12 +75,12 @@ def test_enrich_thumbnail_limits( def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker): mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error")) mocker.patch("os.makedirs") - mock_logger = mocker.patch("loguru.logger.warning") + mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning") mocker.patch.object(Media, "is_video", return_value=True) thumbnail_enricher.enrich(metadata_with_video) # Ensure error was logged - mock_logger.assert_called_with("cannot generate thumbnails for video.mp4 without valid duration") + mock_logger.assert_called_with("Cannot generate thumbnails for video.mp4 without valid duration") # Ensure no thumbnails were created thumbnails = metadata_with_video.media[0].get("thumbnails") assert thumbnails is None @@ -128,12 +128,12 @@ def test_enrich_handles_short_video( def test_uses_existing_duration_on_exception(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker): - mock_logger = mocker.patch("loguru.logger.warning") + mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning") mock_probe = mocker.patch("ffmpeg.probe", side_effect=Exception("New probe error")) metadata_with_video.media[0].set("duration", 3) thumbnail_enricher.enrich(metadata_with_video) mock_probe.assert_called_once() - mock_logger.assert_called_with("failed to get duration with FFMPEG from video.mp4: New probe error") + mock_logger.assert_called_with("Failed to get duration with FFMPEG from video.mp4: New probe error") assert mock_ffmpeg_environment["mock_output"].run.call_count == 3 diff --git a/tests/enrichers/test_wacz_enricher.py b/tests/enrichers/test_wacz_enricher.py index c45c0c3..3a75709 100644 --- a/tests/enrichers/test_wacz_enricher.py +++ b/tests/enrichers/test_wacz_enricher.py @@ -46,7 +46,7 @@ def test_setup_with_docker(wacz_enricher, mocker): def test_already_ran(wacz_enricher, metadata, mocker): metadata.add_media(Media("test.wacz"), id="browsertrix") - mock_log = mocker.patch("loguru.logger.info") + mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info") assert wacz_enricher.enrich(metadata) is True assert "WACZ enricher had already been executed" in mock_log.call_args[0][0] @@ -73,7 +73,7 @@ def test_download_success(wacz_enricher, mocker) -> None: def test_enrich_already_executed(wacz_enricher, mocker) -> None: """Test enrich if already executed.""" - mock_log = mocker.patch("loguru.logger.info") + mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info") metadata = Metadata().set_url("https://example.com") media = Media(filename="some_file.wacz") metadata.add_media(media, id="browsertrix") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 86e3125..b87dfa5 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -118,8 +118,7 @@ def test_check_required_values(orchestrator, caplog, test_args): with pytest.raises(SystemExit): orchestrator.setup_config(test_args) - - assert caplog.records[1].message == "the following arguments are required: --example_module.required_field" + assert "the following arguments are required: --example_module.required_field" in caplog.records[0].message def test_get_required_values_from_config(orchestrator, test_args, tmp_path): diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py index 844d3d2..a3902c7 100644 --- a/tests/utils/test_misc.py +++ b/tests/utils/test_misc.py @@ -6,7 +6,6 @@ import pytest from auto_archiver.utils.misc import ( mkdir_if_not_exists, - expand_url, getattr_or, DateTimeEncoder, dump_payload, @@ -39,26 +38,6 @@ class TestDirectoryUtils: assert existing_dir.exists() -class TestURLExpansion: - @pytest.mark.parametrize( - "input_url,expected", - [("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")], - ) - def test_expand_url(self, input_url, expected, mocker): - mock_response = mocker.Mock() - mock_response.url = "https://expanded.url" - mocker.patch("requests.get", return_value=mock_response) - result = expand_url(input_url) - assert result == expected - - def test_expand_url_handles_errors(self, caplog, mocker): - mocker.patch("requests.get", side_effect=Exception("Connection error")) - url = "https://t.co/error" - result = expand_url(url) - assert result == url - assert f"Failed to expand url {url}" in caplog.text - - class TestAttributeHandling: class Sample: exists = "value" From 7c9475cde2291aade0b948a4f9213533ae04aac0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 00:53:10 +0100 Subject: [PATCH 11/17] allow for human readable console logs, but defaults to JSON on file logs. --- docs/source/how_to/03_logging.md | 17 +++++++++++++- src/auto_archiver/core/orchestrator.py | 6 +++-- src/auto_archiver/utils/custom_logger.py | 28 +++++++++++++++++++++--- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/docs/source/how_to/03_logging.md b/docs/source/how_to/03_logging.md index c283b77..2019eb5 100644 --- a/docs/source/how_to/03_logging.md +++ b/docs/source/how_to/03_logging.md @@ -24,7 +24,7 @@ This will disable all logs from Auto Archiver, but it does not disable logs for #### Logging Level -There are 7 logging levels in total, with 5 of them used in this tool. They are: `DEBUG`, `INFO`, `SUCCESS`, `WARNING` and `ERROR`. +There are 7 logging levels in total, with 5 of them used in this tool. They are: `DEBUG`, `INFO`, `SUCCESS`, `WARNING` and `ERROR`. If you select a level, only that and higher (more serious) levels will be included. `DEBUG` is the most verbose, while `ERROR` is the least verbose. Change the warning level by setting the value in your orchestration config file: @@ -42,6 +42,20 @@ For normal usage, it is recommended to use the `INFO` level, or if you prefer qu ```{note} To learn about all logging levels, see the [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html) ``` +### Logging Format +By default, the console logs are formatted in a human-readable way and the file logs are formatted in JSON. This is new from version 1.1.1. If you want to change the format of the console logs to JSON too you can set the `format:` option in your logging settings. + +```{code} yaml +:caption: orchestration.yaml + +logging: + format: json +``` + +When the Auto Archiver is writing logs it will include context about specific tasks, so if you are archiving a URL from a Google Sheet, both the URL (and a unique `trace_id` for that URL's archiving attempt) and the Spreadsheet name and row will be included in the logs. This is useful for debugging and understanding what the Auto Archiver is doing. + +Using JSON allows you to easily parse the logs and extract specific information, tools like [`jq`](https://jqlang.org/) can be used to filter and search through the logs. + ### Logging to a file As default, auto-archiver will log to the console. But if you wish to store your logs for future reference, or you are running the auto-archiver from within code a implementation, then you may wish to enable file logging. This can be done by setting the `file:` config value in the logging settings. @@ -84,6 +98,7 @@ The below example logs only `DEBUG` logs to the console and to the file `/my/fil logging: level: DEBUG + format: json file: /my/file.log rotation: 1 week ``` \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 96f25c6..9d914d1 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -15,7 +15,7 @@ import traceback from copy import copy from rich_argparse import RichHelpFormatter -from auto_archiver.utils.custom_logger import logger +from auto_archiver.utils.custom_logger import format_for_human_readable_console, logger import requests from auto_archiver.utils.misc import random_str @@ -348,7 +348,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ sys.stderr, level=use_level, catch=True, - format="{level}: {message} {extra[serialize_no_message]}", + format="{extra[serialized]}" + if logging_config.get("format", "").lower() == "json" + else format_for_human_readable_console(), ) rotation = logging_config["rotation"] diff --git a/src/auto_archiver/utils/custom_logger.py b/src/auto_archiver/utils/custom_logger.py index 9c04f35..0b6c36c 100644 --- a/src/auto_archiver/utils/custom_logger.py +++ b/src/auto_archiver/utils/custom_logger.py @@ -2,13 +2,21 @@ from loguru import logger import json +def extract_location(record, short=False): + """Extracts the file name, function name, and line number from the log record.""" + if short: + return f"{record['file'].name}:{record['line']}" + return f"{record['file'].name}:{record['function']}:{record['line']}" + + def extract_log_data(record): subset = { "level": record["level"].name, "time": record["time"].isoformat(timespec="seconds"), } - subset["loc"] = f"{record['file'].name}:{record['function']}:{record['line']}" + subset["loc"] = extract_location(record) + # This is where logger.contextualize() parameters can be added to the output for extra_key in ["trace", "url", "worksheet", "row"]: if extra_val := record.get("extra", {}).get(extra_key): subset[extra_key] = extra_val @@ -19,9 +27,14 @@ def extract_log_data(record): return subset -def serialize_no_message(record): +def serialize_for_console(record): subset = extract_log_data(record) subset.pop("message", None) + subset.pop("level", None) + subset.pop("loc", None) + subset.pop("time", None) + if not subset: + return "" return json.dumps(subset, ensure_ascii=False) @@ -31,7 +44,16 @@ def serialize(record): def patching(record): record["extra"]["serialized"] = serialize(record) - record["extra"]["serialize_no_message"] = serialize_no_message(record) + record["extra"]["serialize_for_console"] = serialize_for_console(record) + + +def format_for_human_readable_console(): + return ( + "{time:YYYY-MM-DD HH:mm:ss.SSS} | " + "{level: <8} | " + "{file}:{function}:{line} | " + "{extra[serialize_for_console]} {message}" + ) logger = logger.patch(patching) From 4ad71b35895592d58eba038fe22f3a17f905f6a9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 01:42:34 +0100 Subject: [PATCH 12/17] adds retry to worksheet read for slow worksheets --- src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py index 6dac059..f22c07a 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py @@ -1,4 +1,5 @@ from gspread import utils +from retrying import retry class GWorksheet: @@ -26,6 +27,12 @@ class GWorksheet: "replaywebpage": "replaywebpage", } + @retry( + wait_incrementing_start=1000, + wait_incrementing_increment=3000, + wait_incrementing_max=20_000, + stop_max_attempt_number=5, + ) def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1): self.wks = worksheet self.columns = columns From b2648fa3cd48d4dd59be0de72854cf62c2735fc3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 01:47:12 +0100 Subject: [PATCH 13/17] follow docs advice on exponential backoff of SheetsAPI --- .../modules/gsheet_feeder_db/gsheet_feeder_db.py | 4 +--- src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index ddc02df..645bd45 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -178,9 +178,7 @@ class GsheetsFeederDB(Feeder, Database): ) @retry( - wait_incrementing_start=1000, - wait_incrementing_increment=3000, - wait_incrementing_max=20_000, + wait_exponential_multiplier=1, stop_max_attempt_number=5, ) def batch_set_cell_with_retry(gw, cell_updates: list): diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py index f22c07a..95ea126 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py @@ -28,10 +28,8 @@ class GWorksheet: } @retry( - wait_incrementing_start=1000, - wait_incrementing_increment=3000, - wait_incrementing_max=20_000, - stop_max_attempt_number=5, + wait_exponential_multiplier=1, + stop_max_attempt_number=6, ) def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1): self.wks = worksheet From 73c8dc583fc68b139da1c55068ddb2ec97666485 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 01:52:22 +0100 Subject: [PATCH 14/17] closes #333 --- poetry.lock | 36 ++++++++++++++++++------------------ pyproject.toml | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/poetry.lock b/poetry.lock index 37b37fa..e9c5468 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2830,25 +2830,25 @@ six = ">=1.7.0" [[package]] name = "rfc3161-client" -version = "1.0.2" +version = "1.0.3" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "rfc3161_client-1.0.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9cf9a8f813028ef2d5d737f738f27c7abe41a4c5c0570fbc2ddfd5e4d03aee7a"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8db097d98b9e3bca4ca68babbeaed8436c4f8d455623c46821bf0cfd8492533f"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8397241db132602e38bc6c4e416cb47d541528b6665aee9788705949487560f7"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8fe3c05f050b18719dac4accce6fdae88e7d5309eb36292eac0cad2f989d159e"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af30b5e46db8b88c1bf7eae182e1bd4080f5d2475044f6ae04ab545e0faaa217"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93b3b3f79f83fefd5399004d3cd522fe93f49dbbb4865dba2c6ac6d8190ab60"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b5fd21b56b5d47136e4ca2ad346db26320a47b282b20d14337711e2bdec5b"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:19cf1cdfa7a3c189d10e58ffdc9553f78972b45bce9dc713c78752b6dd696b5a"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:24653746e2d3868ac53bb47a46d2b891ffddd7fa939954df47301566919ed7e3"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b5a2e502d60176c3d376a7c81a3748b96df64c3c7ff46934f8f0e35b72f9922d"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-win32.whl", hash = "sha256:8cb9d6aa413362b98f40ce4c6667e69ae29a31c91c657547de99203e353ebc43"}, - {file = "rfc3161_client-1.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:03bb5c92a59dd028959142a2dba8edfbf7575d3ccd74ac50eaf2c0ada45e3a40"}, - {file = "rfc3161_client-1.0.2.tar.gz", hash = "sha256:37c78277d78aab02baf17393c30f66d1c2ab1a398d3540b0657792c0ceb81858"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b3f513adc5d4c1c59aed1f5f89fbe2e560410f461ae163fdca8c130939df79d6"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:863d97877c3aa7e42682f70da0f3009618bc1e2aa0a7353133b94dd649d3a602"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:649037dbade2e78bdc1e8d7d917b04f27c245e0d758ab713f2ddeeec0fc6dd52"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6743aa339c07772a53ffb1accc7def78c11d8ebba57c6d25329c1d412dde4dd"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d40bb252d1a0714f4faa6b538be0bcbe9d13c6a7a37188b26f9f23d34aad7a3"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f76bdf2a9f80ea97a99324fa74695621fddc0e6f5d4a4a4e0ca30e822a37e534"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d4d628e00fee72f07bdc779ce75160036c8cb318cac5336cd12692e2d7153e8"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:e5eeb73862b28e5aacc2951c0aec72ecff5209925a4c5be2753cd30f13c39ae5"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:39e188281bc04378130ed52b1b00ee330570f04f0000cc60a0a534803f349482"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ea49605cf10558145b075979d8bfc8bff685c44815bf8b66fd580ced642216c9"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-win32.whl", hash = "sha256:a231b2d3430216491a4dac0cb04afdad0398bf5ded39138938b6002734abf2b4"}, + {file = "rfc3161_client-1.0.3-cp39-abi3-win_amd64.whl", hash = "sha256:f2a925e668b7637c0aecd416dd060ec9579a5edd62502bb88efa981791419a44"}, + {file = "rfc3161_client-1.0.3.tar.gz", hash = "sha256:e9b614a5a4596ab9aea44d3fe8a4995bd84ac7f20dcbfaa82b115224202d88d8"}, ] [package.dependencies] @@ -2856,7 +2856,7 @@ cryptography = ">=43,<46" [package.extras] dev = ["maturin (>=1.7,<2.0)", "rfc3161-client[doc,lint,test]"] -lint = ["interrogate", "ruff (>=0.7,<0.12)"] +lint = ["interrogate", "mypy", "ruff (>=0.7,<0.13)", "types-requests"] test = ["coverage[toml]", "pretend", "pytest", "pytest-cov"] [[package]] @@ -2901,7 +2901,7 @@ description = "Manipulate well-formed Roman numerals" optional = false python-versions = ">=3.9" groups = ["docs"] -markers = "python_version != \"3.10\"" +markers = "python_version >= \"3.11\"" files = [ {file = "roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c"}, {file = "roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d"}, @@ -3330,7 +3330,7 @@ description = "Python documentation generator" optional = false python-versions = ">=3.11" groups = ["docs"] -markers = "python_version != \"3.10\"" +markers = "python_version >= \"3.11\"" files = [ {file = "sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3"}, {file = "sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348"}, @@ -4196,4 +4196,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350" +content-hash = "8f0806dff086087dcf5bbec03902bdd05794dab3d16e7e4b379015db26211c92" diff --git a/pyproject.toml b/pyproject.toml index 4b2eff7..a529334 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "retrying (>=0.0.0)", "rich-argparse (>=1.6.0,<2.0.0)", "ruamel-yaml (>=0.18.10,<0.19.0)", - "rfc3161-client (>=1.0.1,<2.0.0)", + "rfc3161-client (==1.0.3)", "cryptography (>44.0.1,<45.0.0)", "opentimestamps (>=0.4.5,<0.5.0)", "bgutil-ytdlp-pot-provider (>=1.0.0)", From 30ea8a0ba451e29b09ec5cc76919739299773734 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 02:20:09 +0100 Subject: [PATCH 15/17] bumps dependencies --- poetry.lock | 88 ++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/poetry.lock b/poetry.lock index e9c5468..0fbc219 100644 --- a/poetry.lock +++ b/poetry.lock @@ -193,18 +193,18 @@ files = [ [[package]] name = "boto3" -version = "1.38.37" +version = "1.38.46" description = "The AWS SDK for Python" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "boto3-1.38.37-py3-none-any.whl", hash = "sha256:46a512b1fbc4c51a9abfef8e2130db0806cb00ef137e161f6f751421c78a7c0c"}, - {file = "boto3-1.38.37.tar.gz", hash = "sha256:4ccd700a2a36de0cd63bd8c79cca6164cb684e34fc1126de5c41525e4d0bfaee"}, + {file = "boto3-1.38.46-py3-none-any.whl", hash = "sha256:9c8e88a32a6465e5905308708cff5b17547117f06982908bdfdb0108b4a65079"}, + {file = "boto3-1.38.46.tar.gz", hash = "sha256:d1ca2b53138afd0341e1962bd52be6071ab7a63c5b4f89228c5ef8942c40c852"}, ] [package.dependencies] -botocore = ">=1.38.37,<1.39.0" +botocore = ">=1.38.46,<1.39.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.13.0,<0.14.0" @@ -213,14 +213,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.38.37" +version = "1.38.46" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "botocore-1.38.37-py3-none-any.whl", hash = "sha256:f8ad063b7dcdbf12f2c1b5a4405f390ce52beff3b2861af2e5169816ee0146f2"}, - {file = "botocore-1.38.37.tar.gz", hash = "sha256:06ce46da5420ea7cf542ece4ff1ec9045922fef977adf4bbec618c96c7a478bf"}, + {file = "botocore-1.38.46-py3-none-any.whl", hash = "sha256:89ca782ffbf2e8769ca9c89234cfa5ca577f1987d07d913ee3c68c4776b1eb5b"}, + {file = "botocore-1.38.46.tar.gz", hash = "sha256:8798e5a418c27cf93195b077153644aea44cb171fcd56edc1ecebaa1e49e226e"}, ] [package.dependencies] @@ -801,20 +801,20 @@ typing-inspect = ">=0.4.0,<1" [[package]] name = "dateparser" -version = "1.2.1" +version = "1.2.2" description = "Date parsing library designed to parse dates from HTML pages" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "dateparser-1.2.1-py3-none-any.whl", hash = "sha256:bdcac262a467e6260030040748ad7c10d6bacd4f3b9cdb4cfd2251939174508c"}, - {file = "dateparser-1.2.1.tar.gz", hash = "sha256:7e4919aeb48481dbfc01ac9683c8e20bfe95bb715a38c1e9f6af889f4f30ccc3"}, + {file = "dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482"}, + {file = "dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7"}, ] [package.dependencies] python-dateutil = ">=2.7.0" pytz = ">=2024.2" -regex = ">=2015.06.24,<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27" +regex = ">=2024.9.11" tzlocal = ">=0.2" [package.extras] @@ -966,14 +966,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] [[package]] name = "google-api-python-client" -version = "2.172.0" +version = "2.174.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.172.0-py3-none-any.whl", hash = "sha256:9f1b9a268d5dc1228207d246c673d3a09ee211b41a11521d38d9212aeaa43af7"}, - {file = "google_api_python_client-2.172.0.tar.gz", hash = "sha256:dcb3b7e067154b2aa41f1776cf86584a5739c0ac74e6ff46fc665790dca0e6a6"}, + {file = "google_api_python_client-2.174.0-py3-none-any.whl", hash = "sha256:f695205ceec97bfaa1590a14282559c4109326c473b07352233a3584cdbf4b89"}, + {file = "google_api_python_client-2.174.0.tar.gz", hash = "sha256:9eb7616a820b38a9c12c5486f9b9055385c7feb18b20cbafc5c5a688b14f3515"}, ] [package.dependencies] @@ -1604,14 +1604,14 @@ six = ">=1.6.1" [[package]] name = "oauthlib" -version = "3.2.2" +version = "3.3.1" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, - {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, + {file = "oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1"}, + {file = "oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9"}, ] [package.extras] @@ -2008,14 +2008,14 @@ pytweening = ">=1.0.4" [[package]] name = "pycodestyle" -version = "2.13.0" +version = "2.14.0" description = "Python style guide checker" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pycodestyle-2.13.0-py2.py3-none-any.whl", hash = "sha256:35863c5974a271c7a726ed228a14a4f6daf49df369d8c50cd9a6f58a5e143ba9"}, - {file = "pycodestyle-2.13.0.tar.gz", hash = "sha256:c8415bf09abe81d9c7f872502a6eee881fbe85d8763dd5b9924bb0a01d67efae"}, + {file = "pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d"}, + {file = "pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783"}, ] [[package]] @@ -2126,14 +2126,14 @@ pyrect = "*" [[package]] name = "pygments" -version = "2.19.1" +version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" groups = ["main", "dev", "docs"] files = [ - {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, - {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, + {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, + {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, ] [package.extras] @@ -2815,19 +2815,16 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] [[package]] name = "retrying" -version = "1.3.4" +version = "1.4.0" description = "Retrying" optional = false -python-versions = "*" +python-versions = ">=3.6" groups = ["main"] files = [ - {file = "retrying-1.3.4-py3-none-any.whl", hash = "sha256:8cc4d43cb8e1125e0ff3344e9de678fefd85db3b750b81b2240dc0183af37b35"}, - {file = "retrying-1.3.4.tar.gz", hash = "sha256:345da8c5765bd982b1d1915deb9102fd3d1f7ad16bd84a9700b85f64d24e8f3e"}, + {file = "retrying-1.4.0-py3-none-any.whl", hash = "sha256:6509d829c70271937605bce361c8f76e91f9123d355d14df7dc6972b1518064a"}, + {file = "retrying-1.4.0.tar.gz", hash = "sha256:efa99c78bf4fbdbe6f0cba4101470fbc684b93d30ca45ffa1288443a9805172f"}, ] -[package.dependencies] -six = ">=1.7.0" - [[package]] name = "rfc3161-client" version = "1.0.3" @@ -3119,21 +3116,21 @@ websocket-client = ">=1.8.0,<1.9.0" [[package]] name = "seleniumbase" -version = "4.39.4" +version = "4.39.5" description = "A complete web automation framework for end-to-end testing." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "seleniumbase-4.39.4-py3-none-any.whl", hash = "sha256:15562b2550ce6f6fdcc524ff9bd87a1d7381a558767245f10ff63982f508c281"}, - {file = "seleniumbase-4.39.4.tar.gz", hash = "sha256:8880869b88fa5a48c649a776488bafa1ca97d786fb8a25f63e6d5b5b5fc47f44"}, + {file = "seleniumbase-4.39.5-py3-none-any.whl", hash = "sha256:bda571f4864bba126442571bb0a3ae8a9bee9253461253ac84affd9a48efdb4d"}, + {file = "seleniumbase-4.39.5.tar.gz", hash = "sha256:a6d4930eb894c84d881f0fa596fb357b0fa2bb5a9e89ac3875d9e89eb27054c7"}, ] [package.dependencies] attrs = ">=25.3.0" beautifulsoup4 = "4.13.4" behave = "1.2.6" -certifi = ">=2025.4.26" +certifi = ">=2025.6.15" chardet = "5.2.0" charset-normalizer = ">=3.4.2,<4" colorama = ">=0.4.6" @@ -3192,7 +3189,7 @@ wsproto = "1.2.0" [package.extras] allure = ["allure-behave (>=2.13.5)", "allure-pytest (>=2.13.5)", "allure-python-commons (>=2.13.5)"] -coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.9.0) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.2.1) ; python_version >= \"3.9\""] +coverage = ["coverage (>=7.6.1) ; python_version < \"3.9\"", "coverage (>=7.9.1) ; python_version >= \"3.9\"", "pytest-cov (>=5.0.0) ; python_version < \"3.9\"", "pytest-cov (>=6.2.1) ; python_version >= \"3.9\""] flake8 = ["flake8 (==5.0.4) ; python_version < \"3.9\"", "flake8 (==7.2.0) ; python_version >= \"3.9\"", "mccabe (==0.7.0)", "pycodestyle (==2.13.0) ; python_version >= \"3.9\"", "pycodestyle (==2.9.1) ; python_version < \"3.9\"", "pyflakes (==2.5.0) ; python_version < \"3.9\"", "pyflakes (==3.3.2) ; python_version >= \"3.9\""] ipdb = ["ipdb (==0.13.13)", "ipython (==7.34.0)"] mss = ["mss (==10.0.0) ; python_version >= \"3.9\"", "mss (==9.0.2) ; python_version < \"3.9\""] @@ -3565,18 +3562,19 @@ test = ["pytest"] [[package]] name = "starlette" -version = "0.47.0" +version = "0.47.1" description = "The little ASGI library that shines." optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "starlette-0.47.0-py3-none-any.whl", hash = "sha256:9d052d4933683af40ffd47c7465433570b4949dc937e20ad1d73b34e72f10c37"}, - {file = "starlette-0.47.0.tar.gz", hash = "sha256:1f64887e94a447fed5f23309fb6890ef23349b7e478faa7b24a851cd4eb844af"}, + {file = "starlette-0.47.1-py3-none-any.whl", hash = "sha256:5e11c9f5c7c3f24959edbf2dffdc01bba860228acf657129467d8a7468591527"}, + {file = "starlette-0.47.1.tar.gz", hash = "sha256:aef012dd2b6be325ffa16698f9dc533614fb1cebd593a906b90dc1025529a79b"}, ] [package.dependencies] anyio = ">=3.6.2,<5" +typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\""} [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] @@ -3841,14 +3839,14 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "uvicorn" -version = "0.34.3" +version = "0.35.0" description = "The lightning-fast ASGI server." optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "uvicorn-0.34.3-py3-none-any.whl", hash = "sha256:16246631db62bdfbf069b0645177d6e8a77ba950cfedbfd093acef9444e4d885"}, - {file = "uvicorn-0.34.3.tar.gz", hash = "sha256:35919a9a979d7a59334b6b10e05d77c1d0d574c50e0fc98b8b1a0f165708b55a"}, + {file = "uvicorn-0.35.0-py3-none-any.whl", hash = "sha256:197535216b25ff9b785e29a0b79199f55222193d47f820816e7da751e9bc8d4a"}, + {file = "uvicorn-0.35.0.tar.gz", hash = "sha256:bc662f087f7cf2ce11a1d7fd70b90c9f98ef2e2831556dd078d131b96cc94a01"}, ] [package.dependencies] @@ -4162,14 +4160,14 @@ h11 = ">=0.9.0,<1" [[package]] name = "yt-dlp" -version = "2025.6.9" +version = "2025.6.25" description = "A feature-rich command-line audio/video downloader" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "yt_dlp-2025.6.9-py3-none-any.whl", hash = "sha256:ebdfda9ffa807f6a26aed7c8f906e5557cd06b4c388dc547df1ec2078631fca8"}, - {file = "yt_dlp-2025.6.9.tar.gz", hash = "sha256:751f53a3b61353522bf805fa30bbcbd16666126537e39706eab4f8c368f111ac"}, + {file = "yt_dlp-2025.6.25-py3-none-any.whl", hash = "sha256:1eb31c9a47d56c7433be23a6ae084c640bd4e14961ad43076927ef05280871ea"}, + {file = "yt_dlp-2025.6.25.tar.gz", hash = "sha256:242b648e1a18ab04bdd4cc175a317fe8ec3ad7d0175eee9f981912624b3d6c8b"}, ] [package.dependencies] From c2c9718f73db8afffce39d1f01ad1e31e79b82e2 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 02:20:51 +0100 Subject: [PATCH 16/17] make python api tests work on gh when no env is set --- .github/workflows/tests-download.yaml | 2 +- tests/extractors/test_twitter_api_extractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests-download.yaml b/.github/workflows/tests-download.yaml index 51102be..1c5f5e5 100644 --- a/.github/workflows/tests-download.yaml +++ b/.github/workflows/tests-download.yaml @@ -47,4 +47,4 @@ jobs: - name: Run Download Tests run: poetry run pytest -ra -v -x -m "download" env: - TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN }} + TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN || '' }} diff --git a/tests/extractors/test_twitter_api_extractor.py b/tests/extractors/test_twitter_api_extractor.py index 1e07434..94e18b3 100644 --- a/tests/extractors/test_twitter_api_extractor.py +++ b/tests/extractors/test_twitter_api_extractor.py @@ -13,7 +13,7 @@ class TestTwitterApiExtractor(TestExtractorBase): config = { "bearer_tokens": [], - "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN", "TEST_KEY"), + "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN") or "TEST_KEY", "consumer_key": os.environ.get("TWITTER_CONSUMER_KEY"), "consumer_secret": os.environ.get("TWITTER_CONSUMER_SECRET"), "access_token": os.environ.get("TWITTER_ACCESS_TOKEN"), From 649412053e2d9d6848ced0bb3835f5663b85fbf4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 30 Jun 2025 02:27:21 +0100 Subject: [PATCH 17/17] exclude non-ready code --- .../captcha_services/anti_captcha.py | 60 ------------------- 1 file changed, 60 deletions(-) delete mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py deleted file mode 100644 index f624953..0000000 --- a/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py +++ /dev/null @@ -1,60 +0,0 @@ -# def solve_captcha(image_url): -# # Download image -# img_data = requests.get(image_url).content -# encoded_image = base64.b64encode(img_data).decode() - -# # Submit to AntiCaptcha -# task = { -# "clientKey": ANTI_CAPTCHA_KEY, -# "task": { -# "type": "ImageToTextTask", -# "body": encoded_image -# } -# } -# print("[*] Sending captcha request to anti-captcha...") - -# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json() -# task_id = task_response["taskId"] -# print(f"[*] Anti-captcha response: {task_response}") - -# # Poll for result -# while True: -# time.sleep(5) -# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={ -# "clientKey": ANTI_CAPTCHA_KEY, -# "taskId": task_id -# }).json() -# if res["status"] == "ready": -# print(f"[*] Captcha solved: {res}") -# return res["solution"]["text"] -# print(f"[*] Polling for captcha solution: {res['status']}") - - -# def solve_recaptcha(site_key, page_url): -# print("[*] Sending captcha request to anti-captcha...") -# # Step 1: Send captcha request -# task_payload = { -# "clientKey": ANTI_CAPTCHA_KEY, -# "task": { -# "type": "NoCaptchaTaskProxyless", -# "websiteURL": page_url, -# "websiteKey": site_key -# } -# } -# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json() -# print(f"[*] Anti-captcha response: {response}") -# task_id = response["taskId"] - -# # Step 2: Poll for solution -# print("[*] Polling for captcha solution...") -# for i in range(40): # ~80 seconds -# time.sleep(2) -# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={ -# "clientKey": ANTI_CAPTCHA_KEY, -# "taskId": task_id -# }).json() -# print(f" Poll {i+1}: status={result['status']}") -# if result["status"] == "ready": -# print("[*] Captcha solved!") -# return result["solution"]["gRecaptchaResponse"] -# raise TimeoutError("AntiCaptcha took too long")