From ce4d7ac6496d2621fdace69217014e6c6a04bae0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 21 Jun 2025 15:54:51 +0100 Subject: [PATCH] WIP refactor logging --- scripts/telegram_setup.py | 2 +- src/auto_archiver/core/base_module.py | 2 +- src/auto_archiver/core/config.py | 8 +-- src/auto_archiver/core/extractor.py | 6 +- src/auto_archiver/core/media.py | 5 +- src/auto_archiver/core/metadata.py | 2 +- src/auto_archiver/core/module.py | 2 +- src/auto_archiver/core/orchestrator.py | 48 ++++++++------- src/auto_archiver/core/storage.py | 2 +- .../antibot_extractor_enricher.py | 23 ++++--- .../captcha_services/anti_captcha.py | 60 +++++++++++++++++++ .../antibot_extractor_enricher/dropin.py | 7 ++- .../dropins/linkedin.py | 4 +- .../dropins/reddit.py | 8 +-- .../antibot_extractor_enricher/dropins/vk.py | 6 +- src/auto_archiver/modules/api_db/api_db.py | 6 +- .../atlos_feeder_db_storage.py | 16 ++--- .../modules/cli_feeder/cli_feeder.py | 5 -- .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 9 ++- .../modules/gdrive_storage/gdrive_storage.py | 10 ++-- .../modules/generic_extractor/bluesky.py | 4 +- .../generic_extractor/generic_extractor.py | 51 ++++++++-------- .../modules/generic_extractor/tiktok.py | 4 +- .../modules/generic_extractor/twitter.py | 6 +- .../gsheet_feeder_db/gsheet_feeder_db.py | 39 ++++++------ .../modules/hash_enricher/hash_enricher.py | 5 +- .../modules/html_formatter/html_formatter.py | 4 +- .../instagram_api_extractor.py | 36 +++++------ .../instagram_extractor.py | 40 ++++++------- .../instagram_tbot_extractor.py | 2 +- .../modules/json_enricher/json_enricher.py | 2 +- .../modules/local_storage/local_storage.py | 2 +- .../modules/meta_enricher/meta_enricher.py | 2 +- .../metadata_enricher/metadata_enricher.py | 2 +- .../opentimestamps_enricher.py | 2 +- .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- .../modules/s3_storage/s3_storage.py | 2 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../telegram_extractor/telegram_extractor.py | 2 +- .../telethon_extractor/telethon_extractor.py | 2 +- .../thumbnail_enricher/thumbnail_enricher.py | 2 +- .../timestamping_enricher.py | 2 +- .../twitter_api_extractor.py | 2 +- .../wacz_extractor_enricher.py | 2 +- .../wayback_extractor_enricher.py | 2 +- .../whisper_enricher/whisper_enricher.py | 2 +- src/auto_archiver/utils/custom_logger.py | 37 ++++++++++++ src/auto_archiver/utils/misc.py | 2 +- tests/conftest.py | 2 +- .../example_extractor/example_extractor.py | 2 +- .../example_module/example_module.py | 2 +- tests/test_implementation.py | 2 +- 54 files changed, 298 insertions(+), 207 deletions(-) create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py create mode 100644 src/auto_archiver/utils/custom_logger.py diff --git a/scripts/telegram_setup.py b/scripts/telegram_setup.py index 9480cd8..c11f94a 100644 --- a/scripts/telegram_setup.py +++ b/scripts/telegram_setup.py @@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run import os from telethon.sync import TelegramClient -from loguru import logger +from auto_archiver.utils.custom_logger import logger # Create a diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 6461ab7..f12c38d 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory from auto_archiver.utils import url as UrlUtil from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES -from loguru import logger +from auto_archiver.utils.custom_logger import logger if TYPE_CHECKING: from .module import ModuleFactory diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index a2d7679..8e65edf 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap import json import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from copy import deepcopy from auto_archiver.core.consts import MODULE_TYPES @@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser): """ Override of error to format a nicer looking error message using logger """ - logger.error("Problem with configuration file (tip: use --help to see the available options):") - logger.error(message) + logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}") self.exit(2) def parse_known_args(self, args=None, namespace=None): @@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser): try: self._check_value(action, action.default) except argparse.ArgumentError as e: - logger.error(f"You have an invalid setting in your configuration file ({action.dest}):") - logger.error(e) + logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}") exit() return super().parse_known_args(args, namespace) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 5dca928..1720c68 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -12,7 +12,7 @@ from contextlib import suppress import mimetypes import os import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from retrying import retry import re @@ -94,7 +94,7 @@ class Extractor(BaseModule): to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) if verbose: - logger.debug(f"downloading {url[0:50]=} {to_filename=}") + logger.debug(f"downloading {to_filename=}") headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" } @@ -117,7 +117,7 @@ class Extractor(BaseModule): return to_filename except requests.RequestException as e: - logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") + logger.warning(f"Failed to fetch the Media URL: {e}") if try_best_quality: return None, url diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 2fad0ec..fee81d3 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,7 +11,7 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -from loguru import logger +from auto_archiver.utils.custom_logger import logger @dataclass_json # annotation order matters @@ -121,8 +121,7 @@ class Media: except Error: return False # ffmpeg errors when reading bad files except Exception as e: - logger.error(e) - logger.error(traceback.format_exc()) + logger.error(f"{e}: {traceback.format_exc()}") try: fsize = os.path.getsize(self.filename) return fsize > 20_000 diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 370af78..f1ac3c0 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json import datetime from urllib.parse import urlparse from dateutil.parser import parse as parse_dt -from loguru import logger +from auto_archiver.utils.custom_logger import logger from .media import Media diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index f620500..1aad298 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -16,7 +16,7 @@ import sys from importlib.util import find_spec import os from os.path import join -from loguru import logger +from auto_archiver.utils.custom_logger import logger import auto_archiver from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a028ac7..27a1bc9e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -15,9 +15,11 @@ import traceback from copy import copy from rich_argparse import RichHelpFormatter -from loguru import logger +from auto_archiver.utils.custom_logger import logger import requests +from auto_archiver.utils.misc import random_str + from .metadata import Metadata, Media from auto_archiver.version import __version__ from .config import ( @@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ # add other logging info if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 use_level = logging_config["level"] - self.logger_id = logger.add(sys.stderr, level=use_level) + self.logger_id = logger.add( + sys.stderr, + level=use_level, + catch=True, + format="{level}: {message} {extra[serialize_no_message]}", + ) rotation = logging_config["rotation"] log_file = logging_config["file"] @@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ f"{log_file}.{i}_{level.lower()}", filter=lambda rec, lvl=level: rec["level"].name == lvl, rotation=rotation, + format="{extra[serialized]}", ) elif log_file: - logger.add(log_file, rotation=rotation, level=use_level) + logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}") def install_modules(self, modules_by_type): """ @@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ update_cmd = "`docker pull bellingcat/auto-archiver:latest`" else: update_cmd = "`pip install --upgrade auto-archiver`" - logger.warning("") - logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********") logger.warning( - f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})" + f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n" ) - logger.warning(f"Make sure to update to the latest version using: {update_cmd}") - logger.warning("") def setup(self, args: list): """ @@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.setup(args) return self.feed() except Exception as e: - logger.error(e) + logger.error(f"{e}: {traceback.format_exc()}") exit(1) def cleanup(self) -> None: @@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ url_count = 0 for feeder in self.feeders: for item in feeder: - yield self.feed_item(item) - url_count += 1 + with logger.contextualize(url=item.get_url(), trace=random_str(12)): + logger.info("started processing") + yield self.feed_item(item) + url_count += 1 - logger.info(f"Processed {url_count} URL(s)") + logger.info(f"processed {url_count} URL(s)") self.cleanup() def feed_item(self, item: Metadata) -> Metadata: @@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt on {item=}") + logger.warning("caught interrupt") for d in self.databases: d.aborted(item) self.cleanup() exit() except Exception as e: - logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}") + logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}") for d in self.databases: if isinstance(e, AssertionError): d.failed(item, str(e)) @@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: check_url_or_raise(original_url) except ValueError as e: - logger.error(f"Error archiving URL {original_url}: {e}") + logger.error(f"Error archiving: {e}") raise e # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs @@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ result.set_url(url) if original_url != url: - logger.debug(f"Sanitized URL from {original_url} to {url}") + logger.debug(f"Sanitized URL to {url}") result.set("original_url", original_url) # 2 - notify start to DBs, propagate already archived if feature enabled in DBs @@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(cached_result, cached=True) except Exception as e: - logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") return cached_result # 3 - call extractors until one succeeds for a in self.extractors: - logger.info(f"Trying extractor {a.name} for {url}") + logger.info(f"trying extractor {a.name}") try: result.merge(a.download(result)) if result.is_success(): break except Exception as e: - logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") + logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content for e in self.enrichers: try: e.enrich(result) except Exception as exc: - logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") + logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}") # 5 - store all downloaded/generated media result.store(storages=self.storages) @@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ try: d.done(result) except Exception as e: - logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") + logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") return result diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 3205f5a..fd743cb 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -24,7 +24,7 @@ from abc import abstractmethod from typing import IO import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 04e4702..e380adb 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -7,7 +7,7 @@ from urllib.parse import urljoin import glob import importlib.util -from loguru import logger +from auto_archiver.utils.custom_logger import logger import selenium from seleniumbase import SB @@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): continue # Skip imported modules/classes/functions if isinstance(obj, type) and issubclass(obj, Dropin): dropins.append(obj) - logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") + logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") return dropins def sanitize_url(self, url: str) -> str: @@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher): def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None url = to_enrich.get_url() - url_sample = url[:75] try: with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb: - logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...") + logger.info(f"selenium browser is up with agent {self.agent}, opening url...") sb.uc_open_with_reconnect(url, 4) - logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") + logger.debug("handling CAPTCHAs for...") sb.uc_gui_handle_cf() sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future @@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): dropin.open_page(url) if self.detect_auth_wall and self._hit_auth_wall(sb): - logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") + logger.warning("skipping since auth wall or CAPTCHA was detected") return False sb.wait_for_ready_state_complete() @@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): js_css_selector=dropin.js_for_video_css_selectors(), max_media=self.max_download_videos - downloaded_videos, ) - logger.info(f"ANTIBOT completed for {url_sample}") + logger.info("completed") return to_enrich except selenium.common.exceptions.SessionNotCreatedException as e: if custom_data_dir: # the retry logic only works once logger.error( - f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." + f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." ) return self.enrich(to_enrich, custom_data_dir=False) raise e # re-raise except Exception as e: - logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}") + logger.error(f"runtime error: {e}: {traceback.format_exc()}") return False def _get_suitable_dropin(self, url: str, sb: SB): @@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): """ for dropin in self.dropins: if dropin.suitable(url): - logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}") + logger.debug(f"using drop-in {dropin.__name__}") return dropin(sb, self) return DefaultDropin(sb, self) @@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) - logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") + logger.debug(f"setting window size to {x}x{y} for full page screenshot.") sb.set_window_size(x, y) screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") @@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): # js_for_css_selectors for src in sources: if len(all_urls) >= max_media: - logger.debug(f"Reached max download limit of {max_media} images/videos.") + logger.debug(f"reached max download limit of {max_media} images/videos.") break if not is_relevant_url(src): continue diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py new file mode 100644 index 0000000..f624953 --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py @@ -0,0 +1,60 @@ +# def solve_captcha(image_url): +# # Download image +# img_data = requests.get(image_url).content +# encoded_image = base64.b64encode(img_data).decode() + +# # Submit to AntiCaptcha +# task = { +# "clientKey": ANTI_CAPTCHA_KEY, +# "task": { +# "type": "ImageToTextTask", +# "body": encoded_image +# } +# } +# print("[*] Sending captcha request to anti-captcha...") + +# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json() +# task_id = task_response["taskId"] +# print(f"[*] Anti-captcha response: {task_response}") + +# # Poll for result +# while True: +# time.sleep(5) +# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={ +# "clientKey": ANTI_CAPTCHA_KEY, +# "taskId": task_id +# }).json() +# if res["status"] == "ready": +# print(f"[*] Captcha solved: {res}") +# return res["solution"]["text"] +# print(f"[*] Polling for captcha solution: {res['status']}") + + +# def solve_recaptcha(site_key, page_url): +# print("[*] Sending captcha request to anti-captcha...") +# # Step 1: Send captcha request +# task_payload = { +# "clientKey": ANTI_CAPTCHA_KEY, +# "task": { +# "type": "NoCaptchaTaskProxyless", +# "websiteURL": page_url, +# "websiteKey": site_key +# } +# } +# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json() +# print(f"[*] Anti-captcha response: {response}") +# task_id = response["taskId"] + +# # Step 2: Poll for solution +# print("[*] Polling for captcha solution...") +# for i in range(40): # ~80 seconds +# time.sleep(2) +# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={ +# "clientKey": ANTI_CAPTCHA_KEY, +# "taskId": task_id +# }).json() +# print(f" Poll {i+1}: status={result['status']}") +# if result["status"] == "ready": +# print("[*] Captcha solved!") +# return result["solution"]["gRecaptchaResponse"] +# raise TimeoutError("AntiCaptcha took too long") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index d4b255d..c45d7ad 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -1,6 +1,7 @@ import os +import traceback from typing import Mapping -from loguru import logger +from auto_archiver.utils.custom_logger import logger from seleniumbase import SB import yt_dlp @@ -143,7 +144,7 @@ class Dropin: with yt_dlp.YoutubeDL(validated_options) as ydl: for url in video_urls: try: - logger.debug(f"Downloading video from URL: {url}") + logger.debug("downloading video from url") info = ydl.extract_info(url, download=True) filename = ydl_entry_to_filename(ydl, info) if not filename: # Failed to download video. @@ -155,5 +156,5 @@ class Dropin: to_enrich.add_media(media) downloaded += 1 except Exception as e: - logger.error(f"Error downloading {url}: {e}") + logger.error(f"download failed: {e} {traceback.format_exc()}") return downloaded diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py index 336b630..082e409 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py @@ -1,5 +1,5 @@ from typing import Mapping -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -62,7 +62,7 @@ class LinkedinDropin(Dropin): self.sb.wait_for_ready_state_complete() username, password = self._get_username_password("linkedin.com") - logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username) + logger.debug("logging in to Linkedin with username: {}", username) self.sb.type("#username", username) self.sb.type("#password", password) self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index 3f699b6..7f5e23e 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -3,7 +3,7 @@ from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from loguru import logger +from auto_archiver.utils.custom_logger import logger class RedditDropin(Dropin): @@ -50,7 +50,7 @@ class RedditDropin(Dropin): self._close_cookies_banner() username, password = self._get_username_password("reddit.com") - logger.debug("RedditDropin Logging in to Reddit with username: {}", username) + logger.debug("logging in to Reddit with username: {}", username) self.sb.type("#login-username", username) self.sb.type("#login-password", password) @@ -68,7 +68,7 @@ class RedditDropin(Dropin): self.sb.click_link_text("Log in") self.sb.wait_for_ready_state_complete() if self.sb.is_text_visible("Welcome back"): - logger.debug("RedditDropin Login successful") + logger.debug("login successful") self.sb.click_if_visible("this link") def _close_cookies_banner(self): @@ -88,5 +88,5 @@ class RedditDropin(Dropin): .map(el => el.src || el.href) .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); """) - logger.debug("RedditDropin Found {} video URLs", len(filtered_urls)) + logger.debug("found {} video URLs", len(filtered_urls)) return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 3f92eda..02afd75 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -4,7 +4,7 @@ from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin -from loguru import logger +from auto_archiver.utils.custom_logger import logger class VkDropin(Dropin): @@ -57,12 +57,12 @@ class VkDropin(Dropin): self.sb.open("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): - logger.debug("Already logged in to VK.") + logger.debug("already logged in to VK.") return True # need to login username, password = self._get_username_password("vk.com") - logger.debug("Logging in to VK with username: {}", username) + logger.debug("logging in to VK with username: {}", username) self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index c422248..1475375 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import os import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database from auto_archiver.core import Metadata @@ -36,9 +36,9 @@ class AAApiDb(Database): if not self.store_results: return if cached: - logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") + logger.debug("skipping saving archive to AA API because it was cached") return - logger.debug(f"saving archive of {item.get_url()} to the AA API.") + logger.debug("saving archive to the AA API.") payload = { "author_id": self.author_id, diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index c84abd6..814800d 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -3,7 +3,7 @@ import os from typing import IO, Iterator, Optional, Union import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database, Feeder, Media, Metadata, Storage from auto_archiver.utils import calculate_file_hash @@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): """Mark an item as failed in Atlos, if the ID exists.""" atlos_id = item.metadata.get("atlos_id") if not atlos_id: - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + logger.info("No Atlos ID available, skipping") return self._post( f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", json={"metadata": {"processed": True, "status": "error", "error": reason}}, ) - logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}") + logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check and fetch if the given item has been archived already, each @@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): """Mark an item as successfully archived in Atlos.""" atlos_id = item.metadata.get("atlos_id") if not atlos_id: - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + logger.info("item has no Atlos ID, skipping") return self._post( f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", @@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): } }, ) - logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos") + logger.info(f"stored success ID {atlos_id} on Atlos") # ! Atlos Module - Storage Methods @@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: """Upload a media file to Atlos if it has not been uploaded already.""" if metadata is None: - logger.error(f"No metadata provided for {media.filename}") + logger.error(f"no metadata provided for {media.filename}") return False atlos_id = metadata.get("atlos_id") if not atlos_id: - logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.") + logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.") return False media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) @@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): params={"title": media.properties}, files={"file": (os.path.basename(media.filename), file_obj)}, ) - logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") + logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") return True def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 5935466..7bb243b 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,5 +1,3 @@ -from loguru import logger - from auto_archiver.core.feeder import Feeder from auto_archiver.core.metadata import Metadata from auto_archiver.core.consts import SetupError @@ -16,8 +14,5 @@ class CLIFeeder(Feeder): def __iter__(self) -> Metadata: urls = self.config["urls"] for url in urls: - logger.debug(f"Processing {url}") m = Metadata().set_url(url) yield m - - logger.success(f"Processed {len(urls)} URL(s)") diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index c6711c5..d6c1383 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index ac31027..aff4ad0 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -1,5 +1,5 @@ import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from csv import DictWriter from dataclasses import asdict diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 9c72162..f41f6b4 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger import csv from auto_archiver.core import Feeder @@ -20,20 +20,19 @@ class CSVFeeder(Feeder): url_column = first_row.index(url_column) except ValueError: logger.error( - f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" + f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" ) return elif not (url_or_none(first_row[url_column])): # it's a header row, but we've been given a column number already - logger.debug(f"Skipping header row: {first_row}") + logger.debug(f"skipping header row: {first_row}") else: # first row isn't a header row, rewind the file f.seek(0) for row in reader: if not url_or_none(row[url_column]): - logger.warning(f"Not a valid URL in row: {row}, skipping") + logger.warning(f"not a valid URL in row: {row}, skipping") continue url = row[url_column] - logger.debug(f"Processing {url}") yield Metadata().set_url(url) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 50ce244..6a15e80 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -8,7 +8,7 @@ from google.oauth2 import service_account from google.oauth2.credentials import Credentials from googleapiclient.discovery import build from googleapiclient.http import MediaFileUpload -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage @@ -23,10 +23,10 @@ class GDriveStorage(Storage): def _setup_google_drive_service(self): """Initialize Google Drive service based on provided credentials.""" if self.oauth_token: - logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}") + logger.debug(f"using Google Drive OAuth token: {self.oauth_token}") self.service = self._initialize_with_oauth_token() elif self.service_account: - logger.debug(f"Using Google Drive service account: {self.service_account}") + logger.debug(f"using Google Drive service account: {self.service_account}") self.service = self._initialize_with_service_account() else: raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") @@ -41,7 +41,7 @@ class GDriveStorage(Storage): if not creds.valid and creds.expired and creds.refresh_token: creds.refresh(Request()) with open(self.oauth_token, "w") as token_file: - logger.debug("Saving refreshed OAuth token.") + logger.debug("saving refreshed OAuth token.") token_file.write(creds.to_json()) elif not creds.valid: raise ValueError("Invalid OAuth token. Please regenerate the token.") @@ -180,7 +180,7 @@ class GDriveStorage(Storage): Creates a new GDrive folder @name inside folder @parent_id Returns id of the created folder """ - logger.debug(f"Creating new folder with {name=} inside {parent_id=}") + logger.debug(f"creating new folder with {name=} inside {parent_id=}") file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]} gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute() return gd_folder.get("id") diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 5baad6c..261ff03 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,4 +1,4 @@ -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media @@ -18,7 +18,7 @@ class Bluesky(GenericDropin): # download if embeds present (1 video XOR >=1 images) for media in self._download_bsky_embeds(post, archiver): result.add_media(media) - logger.debug(f"Downloaded {len(result.media)} media files") + logger.debug(f"downloaded {len(result.media)} media files") return result diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index a65c5fe..f71ac28 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor from yt_dlp.utils import MaxDownloadsReached import pysubs2 -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media @@ -63,12 +63,11 @@ class GenericExtractor(Extractor): if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1": logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually") else: - logger.warning("yt-dlp or plugin was updated — restarting auto-archiver") - logger.warning(" ======= RESTARTING ======= ") + logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ") os.execv(sys.executable, [sys.executable] + sys.argv) def update_package(self, package_name: str) -> bool: - logger.info(f"Checking and updating {package_name}...") + logger.info(f"checking and updating {package_name}...") from importlib.metadata import version as get_version old_version = get_version(package_name) @@ -80,7 +79,7 @@ class GenericExtractor(Extractor): return True logger.info(f"{package_name} already up to date") except Exception as e: - logger.error(f"Error updating {package_name}: {e}") + logger.error(f"failed to update {package_name}: {e}") return False def setup_po_tokens(self) -> None: @@ -111,7 +110,7 @@ class GenericExtractor(Extractor): missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None] if missing_tools: logger.error( - f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " + f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " "Install these tools or run bgutils via Docker. " "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider" ) @@ -140,7 +139,7 @@ class GenericExtractor(Extractor): f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip" ) zip_path = os.path.join(base_dir, f"{plugin_version}.zip") - logger.info(f"Downloading bgutils release zip for version {plugin_version}...") + logger.info(f"downloading bgutils release zip for version {plugin_version}...") urlretrieve(zip_url, zip_path) with zipfile.ZipFile(zip_path, "r") as z: z.extractall(base_dir) @@ -149,7 +148,7 @@ class GenericExtractor(Extractor): extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}") shutil.move(os.path.join(extracted_root, "server"), server_dir) shutil.rmtree(extracted_root) - logger.info("Installing dependencies and transpiling PoT Generator script...") + logger.info("installing dependencies and transpiling PoT Generator script...") subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True) subprocess.run(["npx", "tsc"], cwd=server_dir, check=True) @@ -165,7 +164,7 @@ class GenericExtractor(Extractor): logger.info(f"PO Token script configured at: {script_path}") except Exception as e: - logger.error(f"Failed to set up PO Token script: {e}") + logger.error(f"failed to set up PO Token script: {e}") def suitable_extractors(self, url: str) -> Generator[str, None, None]: """ @@ -206,7 +205,7 @@ class GenericExtractor(Extractor): media = Media(cover_image_path) metadata.add_media(media, id="cover") except Exception as e: - logger.error(f"Error downloading cover image {thumbnail_url}: {e}") + logger.error(f"could not download cover image {thumbnail_url}: {e}") dropin = self.dropin_for_name(info_extractor.ie_key()) if dropin: @@ -353,7 +352,7 @@ class GenericExtractor(Extractor): if not dropin: # TODO: add a proper link to 'how to create your own dropin' - logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. + logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}. Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") return False @@ -389,7 +388,7 @@ class GenericExtractor(Extractor): # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. continue - logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}") + logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}") new_media = Media(filename) for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: @@ -404,12 +403,12 @@ class GenericExtractor(Extractor): text = " ".join([line.text for line in subs]) new_media.set(f"subtitles_{lang}", text) except Exception as e: - logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}") + logger.error(f"error loading subtitle file {val.get('filepath')}: {e}") result.add_media(new_media) except Exception as e: - logger.error(f"Error processing entry {entry}: {e}") + logger.error(f"error processing entry {entry}: {e}") if not len(result.media): - logger.info(f"No media found for entry {entry}, skipping.") + logger.info(f"no media found for entry {entry}, skipping.") return False return self.add_metadata(data, info_extractor, url, result) @@ -471,14 +470,14 @@ class GenericExtractor(Extractor): def _helper_for_successful_extract_info(data, info_extractor, url, ydl): if data.get("is_live", False) and not self.livestreams: - logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting") + logger.warning("livestream detected, skipping due to 'livestreams' configuration setting") return False # it's a valid video, that the youtubdedl can download out of the box return self.get_metadata_for_video(data, info_extractor, url, ydl) try: if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor): - logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}") + logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}") raise SkipYtdlp() # don't download since it can be a live stream @@ -497,17 +496,17 @@ class GenericExtractor(Extractor): if not isinstance(e, SkipYtdlp): logger.debug( - f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' + f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' ) try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: - logger.error("Error downloading metadata for post: {error}", error=str(post_e)) + logger.error("error downloading metadata for post: {error}", error=str(post_e)) return False except Exception as generic_e: logger.debug( - 'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}', + 'attempt to extract using ytdlp extractor "{name}" failed: \n {error}', name=info_extractor.IE_NAME, error=str(generic_e), exc_info=True, @@ -560,17 +559,17 @@ class GenericExtractor(Extractor): # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if "username" in auth and "password" in auth: - logger.debug(f"Using provided auth username and password for {url}") + logger.debug("using provided auth username and password") ydl_options.extend(("--username", auth["username"])) ydl_options.extend(("--password", auth["password"])) elif "cookie" in auth: - logger.debug(f"Using provided auth cookie for {url}") + logger.debug("using provided auth cookie") yt_dlp.utils.std_headers["cookie"] = auth["cookie"] elif "cookies_from_browser" in auth: - logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}") + logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}") ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) elif "cookies_file" in auth: - logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}") + logger.debug(f"using cookies from file {auth['cookies_file']}") ydl_options.extend(("--cookies", auth["cookies_file"])) # Applying user-defined extractor_args @@ -580,11 +579,11 @@ class GenericExtractor(Extractor): arg_str = ";".join(f"{k}={v}" for k, v in args.items()) else: arg_str = str(args) - logger.debug(f"Setting extractor_args: {key}:{arg_str}") + logger.debug(f"setting extractor_args: {key}:{arg_str}") ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"]) if self.ytdlp_args: - logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}") + logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}") ydl_options += self.ytdlp_args.split(" ") *_, validated_options = yt_dlp.parse_options(ydl_options) diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py index 902eb05..66936e3 100644 --- a/src/auto_archiver/modules/generic_extractor/tiktok.py +++ b/src/auto_archiver/modules/generic_extractor/tiktok.py @@ -1,5 +1,5 @@ import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE @@ -22,7 +22,7 @@ class Tiktok(GenericDropin): return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) def extract_post(self, url: str, ie_instance): - logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}") + logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}") endpoint = self.TIKWM_ENDPOINT.format(url=url) diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 9006e57..c5964ad 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,7 +1,7 @@ import re import mimetypes -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from auto_archiver.core.metadata import Metadata, Media @@ -40,7 +40,7 @@ class Twitter(GenericDropin): raise ValueError("Error retreiving post. Are you sure it exists?") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: - logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") + logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False full_text = tweet.pop("full_text", "") @@ -49,7 +49,7 @@ class Twitter(GenericDropin): result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) if not tweet.get("entities", {}).get("media"): - logger.debug("No media found, archiving tweet text only") + logger.debug("no media found, archiving tweet text only") result.status = "twitter-ytdl" return result for i, tw_media in enumerate(tweet["entities"]["media"]): diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index 10300e0..0f03de7 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects. """ import os +import traceback from typing import Tuple, Union, Iterator from urllib.parse import quote import gspread -from loguru import logger +from auto_archiver.utils.custom_logger import logger from slugify import slugify from retrying import retry @@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database): sh = self.open_sheet() for ii, worksheet in enumerate(sh.worksheets()): if not self.should_process_sheet(worksheet.title): - logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") + logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules") continue - logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}") + logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}") gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.debug( - f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" + f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" ) continue - - # process and yield metadata here: - yield from self._process_rows(gw) - logger.info(f"Finished worksheet {worksheet.title}") + with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"): + # process and yield metadata here: + yield from self._process_rows(gw) + logger.info(f"finished worksheet {worksheet.title}") def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): @@ -69,7 +70,9 @@ class GsheetsFeederDB(Feeder, Database): # All checks done - archival process starts here m = Metadata().set_url(url) self._set_context(m, gw, row) - yield m + + with logger.contextualize(row=row): + yield m def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: # TODO: Check folder value not being recognised @@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database): return missing def started(self, item: Metadata) -> None: - logger.info(f"STARTED {item}") + logger.info("STARTED") gw, row = self._retrieve_gsheet(item) gw.set_cell(row, "status", "Archive in progress") def failed(self, item: Metadata, reason: str) -> None: - logger.error(f"FAILED {item}") + logger.error("FAILED") self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") + logger.warning("ABORTED") self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: @@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database): cell_updates = [] row_values = gw.get_row(row) - spreadsheet = gw.wks.spreadsheet.title - worksheet = gw.wks.title - logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}") + logger.info("DONE") def batch_if_valid(col, val, final_value=None): final_value = final_value or val @@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database): if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") + logger.error(f"unable to batch {col}={final_value} due to {e}") status_message = item.status if cached: @@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database): gw, row = self._retrieve_gsheet(item) gw.set_cell(row, "status", new_status) except Exception as e: - logger.debug(f"Unable to update sheet: {e}") + logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: - logger.error( - f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder." - ) + logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.") return gw, row diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 71425f2..799c5b3 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -9,7 +9,7 @@ making it suitable for handling large files efficiently. """ import hashlib -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata @@ -22,8 +22,7 @@ class HashEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") + logger.debug(f"calculating media hashes with algo={self.algorithm}") for i, m in enumerate(to_enrich.media): if len(hd := self.calculate_hash(m.filename)): diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index f5da1d8..41188f1 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -4,7 +4,7 @@ import os import pathlib from jinja2 import Environment, FileSystemLoader from urllib.parse import quote -from loguru import logger +from auto_archiver.utils.custom_logger import logger import json import base64 @@ -35,7 +35,7 @@ class HtmlFormatter(Formatter): def format(self, item: Metadata) -> Media: url = item.get_url() if item.is_empty(): - logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}") + logger.debug("nothing to format, skipping") return content = self.template.render( diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 1694ddc..e21b089 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -14,7 +14,7 @@ from datetime import datetime import traceback import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from retrying import retry from tqdm import tqdm @@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor): url = item.get_url() url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") insta_matches = self.valid_url.findall(url) - logger.info(f"{insta_matches=}") + if not len(insta_matches) or len(insta_matches[0]) != 3: return if len(insta_matches) > 1: - logger.warning(f"Multiple instagram matches found in {url=}, using the first one") + logger.debug("multiple instagram matches found, using the first one") return g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] if g1 == "": @@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor): return self.download_post(item, id=g3, context="story") return self.download_stories(item, g2) else: - logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}") + logger.warning(f"unknown instagram regex group match {g1=}") return @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) @@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor): count_posts += len(stories) result.set("#stories", len(stories)) except Exception as e: - result.append("errors", f"Error downloading stories for {username}") - logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading stories for {username}") + logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}") # download all posts try: @@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"Error downloading posts for {username}") - logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading posts for {username}") + logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}") # download all tagged try: @@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor): result, user_id, max_to_download=self.full_profile_max_posts - count_posts ) except Exception as e: - result.append("errors", f"Error downloading tagged posts for {username}") - logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading tagged posts for {username}") + logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}") # download all highlights try: @@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor): except Exception as e: result.append( "errors", - f"Error downloading highlight id{h.get('pk')} for {username}", + f"error downloading highlight id{h.get('pk')} for {username}", ) logger.error( - f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" + f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" ) if count_highlights >= max_to_download: logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") @@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, h, "highlight") except Exception as e: - result.append("errors", f"Error downloading highlight {h.get('id')}") - logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading highlight {h.get('id')}") + logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") return h_info @@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, p, "post") except Exception as e: - result.append("errors", f"Error downloading post {p.get('id')}") - logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading post {p.get('id')}") + logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) post_count += 1 if post_count >= max_to_download: @@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor): try: self.scrape_item(result, p, "tagged") except Exception as e: - result.append("errors", f"Error downloading tagged post {p.get('id')}") - logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") + result.append("errors", f"error downloading tagged post {p.get('id')}") + logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") pbar.update(1) tagged_count += 1 if tagged_count >= max_to_download: diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index d559c47..af525f3 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -8,7 +8,7 @@ import re import os import shutil import instaloader -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata @@ -29,8 +29,9 @@ class InstagramExtractor(Extractor): # TODO: links to stories def setup(self) -> None: - logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.") - logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.") + logger.warning( + "Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead." + ) self.insta = instaloader.Instaloader( download_geotags=True, @@ -43,12 +44,11 @@ class InstagramExtractor(Extractor): self.insta.load_session_from_file(self.username, self.session_file) except Exception: try: - logger.debug("Session file failed", exc_info=True) - logger.info("No valid session file found - Attempting login with use and password.") + logger.info("no valid session file found - Attempting login with use and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) except Exception as e: - logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") + logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}") def download(self, item: Metadata) -> Metadata: url = item.get_url() @@ -72,14 +72,14 @@ class InstagramExtractor(Extractor): result = self.download_profile(url, profile_matches[0]) except Exception as e: logger.error( - f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." + f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." ) finally: shutil.rmtree(self.download_folder, ignore_errors=True) return result def download_post(self, url: str, post_id: str) -> Metadata: - logger.debug(f"Instagram {post_id=} detected in {url=}") + logger.debug(f"Instagram {post_id=} detected") post = instaloader.Post.from_shortcode(self.insta.context, post_id) if self.insta.download_post(post, target=post.owner_username): @@ -87,7 +87,7 @@ class InstagramExtractor(Extractor): def download_profile(self, url: str, username: str) -> Metadata: # gets posts, posts where username is tagged, igtv postss, stories, and highlights - logger.debug(f"Instagram {username=} detected in {url=}") + logger.debug(f"Instagram {username=} detected") profile = instaloader.Profile.from_username(self.insta.context, username) try: @@ -95,27 +95,27 @@ class InstagramExtractor(Extractor): try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download post: {post.shortcode}: {e}") + logger.error(f"failed to download post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_posts: {e}") + logger.error(f"failed profile.get_posts: {e}") try: for post in profile.get_tagged_posts(): try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") + logger.error(f"failed to download tagged post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_tagged_posts: {e}") + logger.error(f"failed profile.get_tagged_posts: {e}") try: for post in profile.get_igtv_posts(): try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") except Exception as e: - logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") + logger.error(f"failed to download igtv post: {post.shortcode}: {e}") except Exception as e: - logger.error(f"Failed profile.get_igtv_posts: {e}") + logger.error(f"failed profile.get_igtv_posts: {e}") try: for story in self.insta.get_stories([profile.userid]): @@ -123,9 +123,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") except Exception as e: - logger.error(f"Failed to download story item: {item}: {e}") + logger.error(f"failed to download story item: {item}: {e}") except Exception as e: - logger.error(f"Failed get_stories: {e}") + logger.error(f"failed get_stories: {e}") try: for highlight in self.insta.get_highlights(profile.userid): @@ -133,9 +133,9 @@ class InstagramExtractor(Extractor): try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") except Exception as e: - logger.error(f"Failed to download highlight item: {item}: {e}") + logger.error(f"failed to download highlight item: {item}: {e}") except Exception as e: - logger.error(f"Failed get_highlights: {e}") + logger.error(f"failed get_highlights: {e}") return self.process_downloads(url, f"@{username}", profile._asdict(), None) @@ -158,4 +158,4 @@ class InstagramExtractor(Extractor): return result.success("instagram") except Exception as e: - logger.error(f"Could not fetch instagram post {url} due to: {e}") + logger.error(f"could not fetch instagram post due to: {e}") diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index b4f9378..9d1fd7e 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -12,7 +12,7 @@ import shutil import time from sqlite3 import OperationalError -from loguru import logger +from auto_archiver.utils.custom_logger import logger from telethon.sync import TelegramClient from auto_archiver.core import Extractor diff --git a/src/auto_archiver/modules/json_enricher/json_enricher.py b/src/auto_archiver/modules/json_enricher/json_enricher.py index b0900b6..7a5c41e 100644 --- a/src/auto_archiver/modules/json_enricher/json_enricher.py +++ b/src/auto_archiver/modules/json_enricher/json_enricher.py @@ -1,5 +1,5 @@ import json -from loguru import logger +from auto_archiver.utils.custom_logger import logger import os from auto_archiver.core import Enricher diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index fdc6978..79cb1e8 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -1,7 +1,7 @@ import shutil from typing import IO import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index 9356b16..74f4b9b 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -1,6 +1,6 @@ import datetime import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index e4fac44..b59ce62 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -1,6 +1,6 @@ import subprocess import traceback -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index c920a03..272b112 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,6 +1,6 @@ import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger import opentimestamps from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 19b9c59..bad408f 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -15,7 +15,7 @@ import traceback import pdqhash import numpy as np from PIL import Image, UnidentifiedImageError -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index b5d905d..602cbe4 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -2,7 +2,7 @@ from typing import IO import boto3 import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Media from auto_archiver.core import Storage diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 3ab1389..f6f7b01 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -2,7 +2,7 @@ import ssl import os from slugify import slugify from urllib.parse import urlparse -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index e70198d..f32fb1e 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests import re import html from bs4 import BeautifulSoup -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 2643b32..2dcc90e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import ( ) from tqdm import tqdm -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index a8f844f..4e15adf 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. import ffmpeg import os -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Media, Metadata diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 1626b71..1c95f24 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -5,7 +5,7 @@ import hashlib from slugify import slugify import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder) from rfc3161_client import VerificationError as Rfc3161VerificationError diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 5a0023a..71ea318 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -4,7 +4,7 @@ import re import mimetypes import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger from pytwitter import Api from slugify import slugify diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index b1fbd80..4e21cf7 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -4,7 +4,7 @@ import os import shutil import subprocess from zipfile import ZipFile -from loguru import logger +from auto_archiver.utils.custom_logger import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py index f06effd..2cb1815 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py @@ -1,5 +1,5 @@ import json -from loguru import logger +from auto_archiver.utils.custom_logger import logger import time import requests diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 063bd26..043fc30 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -1,7 +1,7 @@ import traceback import requests import time -from loguru import logger +from auto_archiver.utils.custom_logger import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/utils/custom_logger.py b/src/auto_archiver/utils/custom_logger.py new file mode 100644 index 0000000..9c04f35 --- /dev/null +++ b/src/auto_archiver/utils/custom_logger.py @@ -0,0 +1,37 @@ +from loguru import logger +import json + + +def extract_log_data(record): + subset = { + "level": record["level"].name, + "time": record["time"].isoformat(timespec="seconds"), + } + subset["loc"] = f"{record['file'].name}:{record['function']}:{record['line']}" + + for extra_key in ["trace", "url", "worksheet", "row"]: + if extra_val := record.get("extra", {}).get(extra_key): + subset[extra_key] = extra_val + + subset["message"] = record["message"] + if exception := record.get("exception"): + subset["exception"] = exception + return subset + + +def serialize_no_message(record): + subset = extract_log_data(record) + subset.pop("message", None) + return json.dumps(subset, ensure_ascii=False) + + +def serialize(record): + return json.dumps(extract_log_data(record), ensure_ascii=False) + + +def patching(record): + record["extra"]["serialized"] = serialize(record) + record["extra"]["serialize_no_message"] = serialize_no_message(record) + + +logger = logger.patch(patching) diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 27a1bc9d..4c872f3 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -7,7 +7,7 @@ from datetime import datetime, timezone from dateutil.parser import parse as parse_dt import requests -from loguru import logger +from auto_archiver.utils.custom_logger import logger def mkdir_if_not_exists(folder): diff --git a/tests/conftest.py b/tests/conftest.py index a54f01d..6f47a46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib -from loguru import logger +from auto_archiver.utils.custom_logger import logger import pytest from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.module import ModuleFactory diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py index ade26e4..6a54b40 100644 --- a/tests/data/test_modules/example_extractor/example_extractor.py +++ b/tests/data/test_modules/example_extractor/example_extractor.py @@ -1,6 +1,6 @@ from auto_archiver.core import Extractor -from loguru import logger +from auto_archiver.utils.custom_logger import logger class ExampleExtractor(Extractor): diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py index 898df96..655afec 100644 --- a/tests/data/test_modules/example_module/example_module.py +++ b/tests/data/test_modules/example_module/example_module.py @@ -1,6 +1,6 @@ from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata -from loguru import logger +from auto_archiver.utils.custom_logger import logger class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): diff --git a/tests/test_implementation.py b/tests/test_implementation.py index e52a8d8..69dd5e6 100644 --- a/tests/test_implementation.py +++ b/tests/test_implementation.py @@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path): def autoarchiver(tmp_path, monkeypatch, request): def _autoarchiver(args=[]): def cleanup(): - from loguru import logger + from auto_archiver.utils.custom_logger import logger if not logger._core.handlers.get(0): logger._core.handlers_count = 0