From ce4d7ac6496d2621fdace69217014e6c6a04bae0 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sat, 21 Jun 2025 15:54:51 +0100
Subject: [PATCH] WIP refactor logging
---
scripts/telegram_setup.py | 2 +-
src/auto_archiver/core/base_module.py | 2 +-
src/auto_archiver/core/config.py | 8 +--
src/auto_archiver/core/extractor.py | 6 +-
src/auto_archiver/core/media.py | 5 +-
src/auto_archiver/core/metadata.py | 2 +-
src/auto_archiver/core/module.py | 2 +-
src/auto_archiver/core/orchestrator.py | 48 ++++++++-------
src/auto_archiver/core/storage.py | 2 +-
.../antibot_extractor_enricher.py | 23 ++++---
.../captcha_services/anti_captcha.py | 60 +++++++++++++++++++
.../antibot_extractor_enricher/dropin.py | 7 ++-
.../dropins/linkedin.py | 4 +-
.../dropins/reddit.py | 8 +--
.../antibot_extractor_enricher/dropins/vk.py | 6 +-
src/auto_archiver/modules/api_db/api_db.py | 6 +-
.../atlos_feeder_db_storage.py | 16 ++---
.../modules/cli_feeder/cli_feeder.py | 5 --
.../modules/console_db/console_db.py | 2 +-
src/auto_archiver/modules/csv_db/csv_db.py | 2 +-
.../modules/csv_feeder/csv_feeder.py | 9 ++-
.../modules/gdrive_storage/gdrive_storage.py | 10 ++--
.../modules/generic_extractor/bluesky.py | 4 +-
.../generic_extractor/generic_extractor.py | 51 ++++++++--------
.../modules/generic_extractor/tiktok.py | 4 +-
.../modules/generic_extractor/twitter.py | 6 +-
.../gsheet_feeder_db/gsheet_feeder_db.py | 39 ++++++------
.../modules/hash_enricher/hash_enricher.py | 5 +-
.../modules/html_formatter/html_formatter.py | 4 +-
.../instagram_api_extractor.py | 36 +++++------
.../instagram_extractor.py | 40 ++++++-------
.../instagram_tbot_extractor.py | 2 +-
.../modules/json_enricher/json_enricher.py | 2 +-
.../modules/local_storage/local_storage.py | 2 +-
.../modules/meta_enricher/meta_enricher.py | 2 +-
.../metadata_enricher/metadata_enricher.py | 2 +-
.../opentimestamps_enricher.py | 2 +-
.../pdq_hash_enricher/pdq_hash_enricher.py | 2 +-
.../modules/s3_storage/s3_storage.py | 2 +-
.../modules/ssl_enricher/ssl_enricher.py | 2 +-
.../telegram_extractor/telegram_extractor.py | 2 +-
.../telethon_extractor/telethon_extractor.py | 2 +-
.../thumbnail_enricher/thumbnail_enricher.py | 2 +-
.../timestamping_enricher.py | 2 +-
.../twitter_api_extractor.py | 2 +-
.../wacz_extractor_enricher.py | 2 +-
.../wayback_extractor_enricher.py | 2 +-
.../whisper_enricher/whisper_enricher.py | 2 +-
src/auto_archiver/utils/custom_logger.py | 37 ++++++++++++
src/auto_archiver/utils/misc.py | 2 +-
tests/conftest.py | 2 +-
.../example_extractor/example_extractor.py | 2 +-
.../example_module/example_module.py | 2 +-
tests/test_implementation.py | 2 +-
54 files changed, 298 insertions(+), 207 deletions(-)
create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py
create mode 100644 src/auto_archiver/utils/custom_logger.py
diff --git a/scripts/telegram_setup.py b/scripts/telegram_setup.py
index 9480cd8..c11f94a 100644
--- a/scripts/telegram_setup.py
+++ b/scripts/telegram_setup.py
@@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run
import os
from telethon.sync import TelegramClient
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
# Create a
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index 6461ab7..f12c38d 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
if TYPE_CHECKING:
from .module import ModuleFactory
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index a2d7679..8e65edf 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap
import json
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES
@@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
"""
Override of error to format a nicer looking error message using logger
"""
- logger.error("Problem with configuration file (tip: use --help to see the available options):")
- logger.error(message)
+ logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}")
self.exit(2)
def parse_known_args(self, args=None, namespace=None):
@@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
try:
self._check_value(action, action.default)
except argparse.ArgumentError as e:
- logger.error(f"You have an invalid setting in your configuration file ({action.dest}):")
- logger.error(e)
+ logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}")
exit()
return super().parse_known_args(args, namespace)
diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py
index 5dca928..1720c68 100644
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -12,7 +12,7 @@ from contextlib import suppress
import mimetypes
import os
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from retrying import retry
import re
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose:
- logger.debug(f"downloading {url[0:50]=} {to_filename=}")
+ logger.debug(f"downloading {to_filename=}")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
@@ -117,7 +117,7 @@ class Extractor(BaseModule):
return to_filename
except requests.RequestException as e:
- logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
+ logger.warning(f"Failed to fetch the Media URL: {e}")
if try_best_quality:
return None, url
diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 2fad0ec..fee81d3 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
@dataclass_json # annotation order matters
@@ -121,8 +121,7 @@ class Media:
except Error:
return False # ffmpeg errors when reading bad files
except Exception as e:
- logger.error(e)
- logger.error(traceback.format_exc())
+ logger.error(f"{e}: {traceback.format_exc()}")
try:
fsize = os.path.getsize(self.filename)
return fsize > 20_000
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 370af78..f1ac3c0 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json
import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from .media import Media
diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index f620500..1aad298 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -16,7 +16,7 @@ import sys
from importlib.util import find_spec
import os
from os.path import join
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import auto_archiver
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index a028ac7..27a1bc9e 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -15,9 +15,11 @@ import traceback
from copy import copy
from rich_argparse import RichHelpFormatter
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import requests
+from auto_archiver.utils.misc import random_str
+
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import (
@@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
use_level = logging_config["level"]
- self.logger_id = logger.add(sys.stderr, level=use_level)
+ self.logger_id = logger.add(
+ sys.stderr,
+ level=use_level,
+ catch=True,
+ format="{level}: {message} {extra[serialize_no_message]}",
+ )
rotation = logging_config["rotation"]
log_file = logging_config["file"]
@@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
f"{log_file}.{i}_{level.lower()}",
filter=lambda rec, lvl=level: rec["level"].name == lvl,
rotation=rotation,
+ format="{extra[serialized]}",
)
elif log_file:
- logger.add(log_file, rotation=rotation, level=use_level)
+ logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}")
def install_modules(self, modules_by_type):
"""
@@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
else:
update_cmd = "`pip install --upgrade auto-archiver`"
- logger.warning("")
- logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
logger.warning(
- f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
+ f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n"
)
- logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
- logger.warning("")
def setup(self, args: list):
"""
@@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.setup(args)
return self.feed()
except Exception as e:
- logger.error(e)
+ logger.error(f"{e}: {traceback.format_exc()}")
exit(1)
def cleanup(self) -> None:
@@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
url_count = 0
for feeder in self.feeders:
for item in feeder:
- yield self.feed_item(item)
- url_count += 1
+ with logger.contextualize(url=item.get_url(), trace=random_str(12)):
+ logger.info("started processing")
+ yield self.feed_item(item)
+ url_count += 1
- logger.info(f"Processed {url_count} URL(s)")
+ logger.info(f"processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:
@@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
- logger.warning(f"caught interrupt on {item=}")
+ logger.warning("caught interrupt")
for d in self.databases:
d.aborted(item)
self.cleanup()
exit()
except Exception as e:
- logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
+ logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}")
for d in self.databases:
if isinstance(e, AssertionError):
d.failed(item, str(e))
@@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
check_url_or_raise(original_url)
except ValueError as e:
- logger.error(f"Error archiving URL {original_url}: {e}")
+ logger.error(f"Error archiving: {e}")
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
@@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
result.set_url(url)
if original_url != url:
- logger.debug(f"Sanitized URL from {original_url} to {url}")
+ logger.debug(f"Sanitized URL to {url}")
result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
@@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
d.done(cached_result, cached=True)
except Exception as e:
- logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
+ logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call extractors until one succeeds
for a in self.extractors:
- logger.info(f"Trying extractor {a.name} for {url}")
+ logger.info(f"trying extractor {a.name}")
try:
result.merge(a.download(result))
if result.is_success():
break
except Exception as e:
- logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
+ logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.enrichers:
try:
e.enrich(result)
except Exception as exc:
- logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
+ logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
result.store(storages=self.storages)
@@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
d.done(result)
except Exception as e:
- logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
+ logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return result
diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py
index 3205f5a..fd743cb 100644
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -24,7 +24,7 @@ from abc import abstractmethod
from typing import IO
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from auto_archiver.utils.misc import random_str
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
index 04e4702..e380adb 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -7,7 +7,7 @@ from urllib.parse import urljoin
import glob
import importlib.util
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import selenium
from seleniumbase import SB
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
continue # Skip imported modules/classes/functions
if isinstance(obj, type) and issubclass(obj, Dropin):
dropins.append(obj)
- logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
+ logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
return dropins
def sanitize_url(self, url: str) -> str:
@@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
using_user_data_dir = self.user_data_dir if custom_data_dir else None
url = to_enrich.get_url()
- url_sample = url[:75]
try:
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
- logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
+ logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
sb.uc_open_with_reconnect(url, 4)
- logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
+ logger.debug("handling CAPTCHAs...")
sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
@@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin.open_page(url)
if self.detect_auth_wall and self._hit_auth_wall(sb):
- logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
+ logger.warning("skipping since auth wall or CAPTCHA was detected")
return False
sb.wait_for_ready_state_complete()
@@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos,
)
- logger.info(f"ANTIBOT completed for {url_sample}")
+ logger.info("completed")
return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e:
if custom_data_dir: # the retry logic only works once
logger.error(
- f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
+ f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
)
return self.enrich(to_enrich, custom_data_dir=False)
raise e # re-raise
except Exception as e:
- logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
+ logger.error(f"runtime error: {e}: {traceback.format_exc()}")
return False
def _get_suitable_dropin(self, url: str, sb: SB):
@@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
for dropin in self.dropins:
if dropin.suitable(url):
- logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
+ logger.debug(f"using drop-in {dropin.__name__}")
return dropin(sb, self)
return DefaultDropin(sb, self)
@@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
- logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
+ logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# js_for_css_selectors
for src in sources:
if len(all_urls) >= max_media:
- logger.debug(f"Reached max download limit of {max_media} images/videos.")
+ logger.debug(f"reached max download limit of {max_media} images/videos.")
break
if not is_relevant_url(src):
continue
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py
new file mode 100644
index 0000000..f624953
--- /dev/null
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/anti_captcha.py
@@ -0,0 +1,60 @@
+# def solve_captcha(image_url):
+# # Download image
+# img_data = requests.get(image_url).content
+# encoded_image = base64.b64encode(img_data).decode()
+
+# # Submit to AntiCaptcha
+# task = {
+# "clientKey": ANTI_CAPTCHA_KEY,
+# "task": {
+# "type": "ImageToTextTask",
+# "body": encoded_image
+# }
+# }
+# print("[*] Sending captcha request to anti-captcha...")
+
+# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json()
+# task_id = task_response["taskId"]
+# print(f"[*] Anti-captcha response: {task_response}")
+
+# # Poll for result
+# while True:
+# time.sleep(5)
+# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={
+# "clientKey": ANTI_CAPTCHA_KEY,
+# "taskId": task_id
+# }).json()
+# if res["status"] == "ready":
+# print(f"[*] Captcha solved: {res}")
+# return res["solution"]["text"]
+# print(f"[*] Polling for captcha solution: {res['status']}")
+
+
+# def solve_recaptcha(site_key, page_url):
+# print("[*] Sending captcha request to anti-captcha...")
+# # Step 1: Send captcha request
+# task_payload = {
+# "clientKey": ANTI_CAPTCHA_KEY,
+# "task": {
+# "type": "NoCaptchaTaskProxyless",
+# "websiteURL": page_url,
+# "websiteKey": site_key
+# }
+# }
+# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json()
+# print(f"[*] Anti-captcha response: {response}")
+# task_id = response["taskId"]
+
+# # Step 2: Poll for solution
+# print("[*] Polling for captcha solution...")
+# for i in range(40): # ~80 seconds
+# time.sleep(2)
+# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={
+# "clientKey": ANTI_CAPTCHA_KEY,
+# "taskId": task_id
+# }).json()
+# print(f" Poll {i+1}: status={result['status']}")
+# if result["status"] == "ready":
+# print("[*] Captcha solved!")
+# return result["solution"]["gRecaptchaResponse"]
+# raise TimeoutError("AntiCaptcha took too long")
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index d4b255d..c45d7ad 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,6 +1,7 @@
import os
+import traceback
from typing import Mapping
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from seleniumbase import SB
import yt_dlp
@@ -143,7 +144,7 @@ class Dropin:
with yt_dlp.YoutubeDL(validated_options) as ydl:
for url in video_urls:
try:
- logger.debug(f"Downloading video from URL: {url}")
+ logger.debug("downloading video from url")
info = ydl.extract_info(url, download=True)
filename = ydl_entry_to_filename(ydl, info)
if not filename: # Failed to download video.
@@ -155,5 +156,5 @@ class Dropin:
to_enrich.add_media(media)
downloaded += 1
except Exception as e:
- logger.error(f"Error downloading {url}: {e}")
+ logger.error(f"download failed: {e} {traceback.format_exc()}")
return downloaded
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
index 336b630..082e409 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
@@ -1,5 +1,5 @@
from typing import Mapping
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
self.sb.wait_for_ready_state_complete()
username, password = self._get_username_password("linkedin.com")
- logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
+ logger.debug("logging in to Linkedin with username: {}", username)
self.sb.type("#username", username)
self.sb.type("#password", password)
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
index 3f699b6..7f5e23e 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -3,7 +3,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
class RedditDropin(Dropin):
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
self._close_cookies_banner()
username, password = self._get_username_password("reddit.com")
- logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
+ logger.debug("logging in to Reddit with username: {}", username)
self.sb.type("#login-username", username)
self.sb.type("#login-password", password)
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
self.sb.click_link_text("Log in")
self.sb.wait_for_ready_state_complete()
if self.sb.is_text_visible("Welcome back"):
- logger.debug("RedditDropin Login successful")
+ logger.debug("login successful")
self.sb.click_if_visible("this link")
def _close_cookies_banner(self):
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
.map(el => el.src || el.href)
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
""")
- logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
+ logger.debug("found {} video URLs", len(filtered_urls))
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
index 3f92eda..02afd75 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -4,7 +4,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
class VkDropin(Dropin):
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url():
- logger.debug("Already logged in to VK.")
+ logger.debug("already logged in to VK.")
return True
# need to login
username, password = self._get_username_password("vk.com")
- logger.debug("Logging in to VK with username: {}", username)
+ logger.debug("logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py
index c422248..1475375 100644
--- a/src/auto_archiver/modules/api_db/api_db.py
+++ b/src/auto_archiver/modules/api_db/api_db.py
@@ -2,7 +2,7 @@ from typing import Union
import os
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
@@ -36,9 +36,9 @@ class AAApiDb(Database):
if not self.store_results:
return
if cached:
- logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
+ logger.debug("skipping saving archive to the AA API because it was cached")
return
- logger.debug(f"saving archive of {item.get_url()} to the AA API.")
+ logger.debug("saving archive to the AA API.")
payload = {
"author_id": self.author_id,
diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py
index c84abd6..814800d 100644
--- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py
+++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py
@@ -3,7 +3,7 @@ import os
from typing import IO, Iterator, Optional, Union
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
from auto_archiver.utils import calculate_file_hash
@@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as failed in Atlos, if the ID exists."""
atlos_id = item.metadata.get("atlos_id")
if not atlos_id:
- logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+ logger.info("item has no Atlos ID, skipping")
return
self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
json={"metadata": {"processed": True, "status": "error", "error": reason}},
)
- logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")
+ logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check and fetch if the given item has been archived already, each
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as successfully archived in Atlos."""
atlos_id = item.metadata.get("atlos_id")
if not atlos_id:
- logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+ logger.info("item has no Atlos ID, skipping")
return
self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
}
},
)
- logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")
+ logger.info(f"stored success ID {atlos_id} on Atlos")
# ! Atlos Module - Storage Methods
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
"""Upload a media file to Atlos if it has not been uploaded already."""
if metadata is None:
- logger.error(f"No metadata provided for {media.filename}")
+ logger.error(f"no metadata provided for {media.filename}")
return False
atlos_id = metadata.get("atlos_id")
if not atlos_id:
- logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
+ logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
return False
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
params={"title": media.properties},
files={"file": (os.path.basename(media.filename), file_obj)},
)
- logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
+ logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
return True
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
index 5935466..7bb243b 100644
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -1,5 +1,3 @@
-from loguru import logger
-
from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.consts import SetupError
@@ -16,8 +14,5 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
urls = self.config["urls"]
for url in urls:
- logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
yield m
-
- logger.success(f"Processed {len(urls)} URL(s)")
diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py
index c6711c5..d6c1383 100644
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -1,4 +1,4 @@
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py
index ac31027..aff4ad0 100644
--- a/src/auto_archiver/modules/csv_db/csv_db.py
+++ b/src/auto_archiver/modules/csv_db/csv_db.py
@@ -1,5 +1,5 @@
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from csv import DictWriter
from dataclasses import asdict
diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
index 9c72162..f41f6b4 100644
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -1,4 +1,4 @@
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import csv
from auto_archiver.core import Feeder
@@ -20,20 +20,19 @@ class CSVFeeder(Feeder):
url_column = first_row.index(url_column)
except ValueError:
logger.error(
- f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
+ f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
)
return
elif not (url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already
- logger.debug(f"Skipping header row: {first_row}")
+ logger.debug(f"skipping header row: {first_row}")
else:
# first row isn't a header row, rewind the file
f.seek(0)
for row in reader:
if not url_or_none(row[url_column]):
- logger.warning(f"Not a valid URL in row: {row}, skipping")
+ logger.warning(f"not a valid URL in row: {row}, skipping")
continue
url = row[url_column]
- logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 50ce244..6a15e80 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -8,7 +8,7 @@ from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials."""
if self.oauth_token:
- logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
+ logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token()
elif self.service_account:
- logger.debug(f"Using Google Drive service account: {self.service_account}")
+ logger.debug(f"using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account()
else:
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request())
with open(self.oauth_token, "w") as token_file:
- logger.debug("Saving refreshed OAuth token.")
+ logger.debug("saving refreshed OAuth token.")
token_file.write(creds.to_json())
elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.")
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder
"""
- logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
+ logger.debug(f"creating new folder with {name=} inside {parent_id=}")
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
return gd_folder.get("id")
diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py
index 5baad6c..261ff03 100644
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -1,4 +1,4 @@
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
# download if embeds present (1 video XOR >=1 images)
for media in self._download_bsky_embeds(post, archiver):
result.add_media(media)
- logger.debug(f"Downloaded {len(result.media)} media files")
+ logger.debug(f"downloaded {len(result.media)} media files")
return result
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
index a65c5fe..f71ac28 100644
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import MaxDownloadsReached
import pysubs2
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
@@ -63,12 +63,11 @@ class GenericExtractor(Extractor):
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
else:
- logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
- logger.warning(" ======= RESTARTING ======= ")
+ logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ")
os.execv(sys.executable, [sys.executable] + sys.argv)
def update_package(self, package_name: str) -> bool:
- logger.info(f"Checking and updating {package_name}...")
+ logger.info(f"checking and updating {package_name}...")
from importlib.metadata import version as get_version
old_version = get_version(package_name)
@@ -80,7 +79,7 @@ class GenericExtractor(Extractor):
return True
logger.info(f"{package_name} already up to date")
except Exception as e:
- logger.error(f"Error updating {package_name}: {e}")
+ logger.error(f"failed to update {package_name}: {e}")
return False
def setup_po_tokens(self) -> None:
@@ -111,7 +110,7 @@ class GenericExtractor(Extractor):
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
if missing_tools:
logger.error(
- f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
+ f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
"Install these tools or run bgutils via Docker. "
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
)
@@ -140,7 +139,7 @@ class GenericExtractor(Extractor):
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
)
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
- logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
+ logger.info(f"downloading bgutils release zip for version {plugin_version}...")
urlretrieve(zip_url, zip_path)
with zipfile.ZipFile(zip_path, "r") as z:
z.extractall(base_dir)
@@ -149,7 +148,7 @@ class GenericExtractor(Extractor):
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
shutil.move(os.path.join(extracted_root, "server"), server_dir)
shutil.rmtree(extracted_root)
- logger.info("Installing dependencies and transpiling PoT Generator script...")
+ logger.info("installing dependencies and transpiling PoT Generator script...")
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
@@ -165,7 +164,7 @@ class GenericExtractor(Extractor):
logger.info(f"PO Token script configured at: {script_path}")
except Exception as e:
- logger.error(f"Failed to set up PO Token script: {e}")
+ logger.error(f"failed to set up PO Token script: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
"""
@@ -206,7 +205,7 @@ class GenericExtractor(Extractor):
media = Media(cover_image_path)
metadata.add_media(media, id="cover")
except Exception as e:
- logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
+ logger.error(f"could not download cover image {thumbnail_url}: {e}")
dropin = self.dropin_for_name(info_extractor.ie_key())
if dropin:
@@ -353,7 +352,7 @@ class GenericExtractor(Extractor):
if not dropin:
# TODO: add a proper link to 'how to create your own dropin'
- logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
+ logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
return False
@@ -389,7 +388,7 @@ class GenericExtractor(Extractor):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue
- logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
+ logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -404,12 +403,12 @@ class GenericExtractor(Extractor):
text = " ".join([line.text for line in subs])
new_media.set(f"subtitles_{lang}", text)
except Exception as e:
- logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
+ logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
result.add_media(new_media)
except Exception as e:
- logger.error(f"Error processing entry {entry}: {e}")
+ logger.error(f"error processing entry {entry}: {e}")
if not len(result.media):
- logger.info(f"No media found for entry {entry}, skipping.")
+ logger.info(f"no media found for entry {entry}, skipping.")
return False
return self.add_metadata(data, info_extractor, url, result)
@@ -471,14 +470,14 @@ class GenericExtractor(Extractor):
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
if data.get("is_live", False) and not self.livestreams:
- logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+ logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video, that the youtubdedl can download out of the box
return self.get_metadata_for_video(data, info_extractor, url, ydl)
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
- logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+ logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp()
# don't download since it can be a live stream
@@ -497,17 +496,17 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp):
logger.debug(
- f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
+ f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
)
try:
result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
- logger.error("Error downloading metadata for post: {error}", error=str(post_e))
+ logger.error("error downloading metadata for post: {error}", error=str(post_e))
return False
except Exception as generic_e:
logger.debug(
- 'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
+ 'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
name=info_extractor.IE_NAME,
error=str(generic_e),
exc_info=True,
@@ -560,17 +559,17 @@ class GenericExtractor(Extractor):
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if "username" in auth and "password" in auth:
- logger.debug(f"Using provided auth username and password for {url}")
+ logger.debug("using provided auth username and password")
ydl_options.extend(("--username", auth["username"]))
ydl_options.extend(("--password", auth["password"]))
elif "cookie" in auth:
- logger.debug(f"Using provided auth cookie for {url}")
+ logger.debug("using provided auth cookie")
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
elif "cookies_from_browser" in auth:
- logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
+ logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
elif "cookies_file" in auth:
- logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
+ logger.debug(f"using cookies from file {auth['cookies_file']}")
ydl_options.extend(("--cookies", auth["cookies_file"]))
# Applying user-defined extractor_args
@@ -580,11 +579,11 @@ class GenericExtractor(Extractor):
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else:
arg_str = str(args)
- logger.debug(f"Setting extractor_args: {key}:{arg_str}")
+ logger.debug(f"setting extractor_args: {key}:{arg_str}")
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args:
- logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
+ logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
ydl_options += self.ytdlp_args.split(" ")
*_, validated_options = yt_dlp.parse_options(ydl_options)
diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py
index 902eb05..66936e3 100644
--- a/src/auto_archiver/modules/generic_extractor/tiktok.py
+++ b/src/auto_archiver/modules/generic_extractor/tiktok.py
@@ -1,5 +1,5 @@
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
def extract_post(self, url: str, ie_instance):
- logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
+ logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
endpoint = self.TIKWM_ENDPOINT.format(url=url)
diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py
index 9006e57..c5964ad 100644
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,7 +1,7 @@
import re
import mimetypes
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
raise ValueError("Error retreiving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
- logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
+ logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False
full_text = tweet.pop("full_text", "")
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
- logger.debug("No media found, archiving tweet text only")
+ logger.debug("no media found, archiving tweet text only")
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
index 10300e0..0f03de7 100644
--- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
@@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects.
"""
import os
+import traceback
from typing import Tuple, Union, Iterator
from urllib.parse import quote
import gspread
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from retrying import retry
@@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
sh = self.open_sheet()
for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title):
- logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
+ logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
continue
- logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
+ logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.debug(
- f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
+ f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
)
continue
-
- # process and yield metadata here:
- yield from self._process_rows(gw)
- logger.info(f"Finished worksheet {worksheet.title}")
+ with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
+ # process and yield metadata here:
+ yield from self._process_rows(gw)
+ logger.info(f"finished worksheet {worksheet.title}")
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
@@ -69,7 +70,9 @@ class GsheetsFeederDB(Feeder, Database):
# All checks done - archival process starts here
m = Metadata().set_url(url)
self._set_context(m, gw, row)
- yield m
+
+ with logger.contextualize(row=row):
+ yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
# TODO: Check folder value not being recognised
@@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database):
return missing
def started(self, item: Metadata) -> None:
- logger.info(f"STARTED {item}")
+ logger.info("STARTED")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason: str) -> None:
- logger.error(f"FAILED {item}")
+ logger.error("FAILED")
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
- logger.warning(f"ABORTED {item}")
+ logger.warning("ABORTED")
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
@@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database):
cell_updates = []
row_values = gw.get_row(row)
- spreadsheet = gw.wks.spreadsheet.title
- worksheet = gw.wks.title
- logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}")
+ logger.info("DONE")
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
@@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
- logger.error(f"Unable to batch {col}={final_value} due to {e}")
+ logger.error(f"unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
@@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", new_status)
except Exception as e:
- logger.debug(f"Unable to update sheet: {e}")
+ logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
- logger.error(
- f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
- )
+ logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
return gw, row
diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py
index 71425f2..799c5b3 100644
--- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py
+++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py
@@ -9,7 +9,7 @@ making it suitable for handling large files efficiently.
"""
import hashlib
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
@@ -22,8 +22,7 @@ class HashEnricher(Enricher):
"""
def enrich(self, to_enrich: Metadata) -> None:
- url = to_enrich.get_url()
- logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
+ logger.debug(f"calculating media hashes with algo={self.algorithm}")
for i, m in enumerate(to_enrich.media):
if len(hd := self.calculate_hash(m.filename)):
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index f5da1d8..41188f1 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -4,7 +4,7 @@ import os
import pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import json
import base64
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
def format(self, item: Metadata) -> Media:
url = item.get_url()
if item.is_empty():
- logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
+ logger.debug("nothing to format, skipping")
return
content = self.template.render(
diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
index 1694ddc..e21b089 100644
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -14,7 +14,7 @@ from datetime import datetime
import traceback
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from retrying import retry
from tqdm import tqdm
@@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor):
url = item.get_url()
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
insta_matches = self.valid_url.findall(url)
- logger.info(f"{insta_matches=}")
+
if not len(insta_matches) or len(insta_matches[0]) != 3:
return
if len(insta_matches) > 1:
- logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
+ logger.debug("multiple instagram matches found, using the first one")
return
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
if g1 == "":
@@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor):
return self.download_post(item, id=g3, context="story")
return self.download_stories(item, g2)
else:
- logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
+ logger.warning(f"unknown instagram regex group match {g1=}")
return
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
count_posts += len(stories)
result.set("#stories", len(stories))
except Exception as e:
- result.append("errors", f"Error downloading stories for {username}")
- logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading stories for {username}")
+ logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
# download all posts
try:
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
)
except Exception as e:
- result.append("errors", f"Error downloading posts for {username}")
- logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading posts for {username}")
+ logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
# download all tagged
try:
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
)
except Exception as e:
- result.append("errors", f"Error downloading tagged posts for {username}")
- logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading tagged posts for {username}")
+ logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
# download all highlights
try:
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
except Exception as e:
result.append(
"errors",
- f"Error downloading highlight id{h.get('pk')} for {username}",
+ f"error downloading highlight id{h.get('pk')} for {username}",
)
logger.error(
- f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
+ f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
)
if count_highlights >= max_to_download:
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
@@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, h, "highlight")
except Exception as e:
- result.append("errors", f"Error downloading highlight {h.get('id')}")
- logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading highlight {h.get('id')}")
+ logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
return h_info
@@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, p, "post")
except Exception as e:
- result.append("errors", f"Error downloading post {p.get('id')}")
- logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading post {p.get('id')}")
+ logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1)
post_count += 1
if post_count >= max_to_download:
@@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, p, "tagged")
except Exception as e:
- result.append("errors", f"Error downloading tagged post {p.get('id')}")
- logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
+ result.append("errors", f"error downloading tagged post {p.get('id')}")
+ logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1)
tagged_count += 1
if tagged_count >= max_to_download:
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index d559c47..af525f3 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -8,7 +8,7 @@ import re
import os
import shutil
import instaloader
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata
@@ -29,8 +29,9 @@ class InstagramExtractor(Extractor):
# TODO: links to stories
def setup(self) -> None:
- logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
- logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+ logger.warning(
+ "Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead."
+ )
self.insta = instaloader.Instaloader(
download_geotags=True,
@@ -43,12 +44,11 @@ class InstagramExtractor(Extractor):
self.insta.load_session_from_file(self.username, self.session_file)
except Exception:
try:
- logger.debug("Session file failed", exc_info=True)
- logger.info("No valid session file found - Attempting login with use and password.")
+ logger.info("no valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file)
except Exception as e:
- logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
+ logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
@@ -72,14 +72,14 @@ class InstagramExtractor(Extractor):
result = self.download_profile(url, profile_matches[0])
except Exception as e:
logger.error(
- f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
+ f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
)
finally:
shutil.rmtree(self.download_folder, ignore_errors=True)
return result
def download_post(self, url: str, post_id: str) -> Metadata:
- logger.debug(f"Instagram {post_id=} detected in {url=}")
+ logger.debug(f"Instagram {post_id=} detected")
post = instaloader.Post.from_shortcode(self.insta.context, post_id)
if self.insta.download_post(post, target=post.owner_username):
@@ -87,7 +87,7 @@ class InstagramExtractor(Extractor):
def download_profile(self, url: str, username: str) -> Metadata:
# gets posts, posts where username is tagged, igtv postss, stories, and highlights
- logger.debug(f"Instagram {username=} detected in {url=}")
+ logger.debug(f"Instagram {username=} detected")
profile = instaloader.Profile.from_username(self.insta.context, username)
try:
@@ -95,27 +95,27 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
except Exception as e:
- logger.error(f"Failed to download post: {post.shortcode}: {e}")
+ logger.error(f"failed to download post: {post.shortcode}: {e}")
except Exception as e:
- logger.error(f"Failed profile.get_posts: {e}")
+ logger.error(f"failed profile.get_posts: {e}")
try:
for post in profile.get_tagged_posts():
try:
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
except Exception as e:
- logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
+ logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
except Exception as e:
- logger.error(f"Failed profile.get_tagged_posts: {e}")
+ logger.error(f"failed profile.get_tagged_posts: {e}")
try:
for post in profile.get_igtv_posts():
try:
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
except Exception as e:
- logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
+ logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
except Exception as e:
- logger.error(f"Failed profile.get_igtv_posts: {e}")
+ logger.error(f"failed profile.get_igtv_posts: {e}")
try:
for story in self.insta.get_stories([profile.userid]):
@@ -123,9 +123,9 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
except Exception as e:
- logger.error(f"Failed to download story item: {item}: {e}")
+ logger.error(f"failed to download story item: {item}: {e}")
except Exception as e:
- logger.error(f"Failed get_stories: {e}")
+ logger.error(f"failed get_stories: {e}")
try:
for highlight in self.insta.get_highlights(profile.userid):
@@ -133,9 +133,9 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
except Exception as e:
- logger.error(f"Failed to download highlight item: {item}: {e}")
+ logger.error(f"failed to download highlight item: {item}: {e}")
except Exception as e:
- logger.error(f"Failed get_highlights: {e}")
+ logger.error(f"failed get_highlights: {e}")
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
@@ -158,4 +158,4 @@ class InstagramExtractor(Extractor):
return result.success("instagram")
except Exception as e:
- logger.error(f"Could not fetch instagram post {url} due to: {e}")
+ logger.error(f"could not fetch instagram post due to: {e}")
diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
index b4f9378..9d1fd7e 100644
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -12,7 +12,7 @@ import shutil
import time
from sqlite3 import OperationalError
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor
diff --git a/src/auto_archiver/modules/json_enricher/json_enricher.py b/src/auto_archiver/modules/json_enricher/json_enricher.py
index b0900b6..7a5c41e 100644
--- a/src/auto_archiver/modules/json_enricher/json_enricher.py
+++ b/src/auto_archiver/modules/json_enricher/json_enricher.py
@@ -1,5 +1,5 @@
import json
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import os
from auto_archiver.core import Enricher
diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py
index fdc6978..79cb1e8 100644
--- a/src/auto_archiver/modules/local_storage/local_storage.py
+++ b/src/auto_archiver/modules/local_storage/local_storage.py
@@ -1,7 +1,7 @@
import shutil
from typing import IO
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py
index 9356b16..74f4b9b 100644
--- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py
+++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py
@@ -1,6 +1,6 @@
import datetime
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py
index e4fac44..b59ce62 100644
--- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py
+++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py
@@ -1,6 +1,6 @@
import subprocess
import traceback
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py
index c920a03..272b112 100644
--- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py
+++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py
@@ -1,6 +1,6 @@
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import opentimestamps
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py
index 19b9c59..bad408f 100644
--- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py
+++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py
@@ -15,7 +15,7 @@ import traceback
import pdqhash
import numpy as np
from PIL import Image, UnidentifiedImageError
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py
index b5d905d..602cbe4 100644
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -2,7 +2,7 @@ from typing import IO
import boto3
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
index 3ab1389..f6f7b01 100644
--- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
+++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
@@ -2,7 +2,7 @@ import ssl
import os
from slugify import slugify
from urllib.parse import urlparse
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
index e70198d..f32fb1e 100644
--- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
+++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
@@ -2,7 +2,7 @@ import requests
import re
import html
from bs4 import BeautifulSoup
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 2643b32..2dcc90e 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import (
)
from tqdm import tqdm
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
index a8f844f..4e15adf 100644
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg
import os
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata
diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
index 1626b71..1c95f24 100644
--- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
+++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -5,7 +5,7 @@ import hashlib
from slugify import slugify
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
from rfc3161_client import VerificationError as Rfc3161VerificationError
diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
index 5a0023a..71ea318 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -4,7 +4,7 @@ import re
import mimetypes
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from pytwitter import Api
from slugify import slugify
diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
index b1fbd80..4e21cf7 100644
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -4,7 +4,7 @@ import os
import shutil
import subprocess
from zipfile import ZipFile
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata
diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py
index f06effd..2cb1815 100644
--- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py
@@ -1,5 +1,5 @@
import json
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import time
import requests
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
index 063bd26..043fc30 100644
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -1,7 +1,7 @@
import traceback
import requests
import time
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
diff --git a/src/auto_archiver/utils/custom_logger.py b/src/auto_archiver/utils/custom_logger.py
new file mode 100644
index 0000000..9c04f35
--- /dev/null
+++ b/src/auto_archiver/utils/custom_logger.py
@@ -0,0 +1,80 @@
+from loguru import logger
+import json
+
+
+# Keys of ``record["extra"]`` that are copied into the structured log output.
+EXTRA_KEYS = ("trace", "url", "worksheet", "row")
+
+
+def _dumps(data):
+    """Encode *data* as JSON, keeping non-ASCII text readable."""
+    # default=str is deliberate: a value bound via logger.bind()/contextualize()
+    # that JSON cannot encode must degrade to its str() form instead of raising
+    # inside the patcher and breaking the log call that triggered it.
+    return json.dumps(data, ensure_ascii=False, default=str)
+
+
+def _describe_exception(exception):
+    """Return ``"ExcType: message"`` for a record's exception, or None.
+
+    loguru stores the exception as a ``(type, value, traceback)`` tuple; the
+    class and traceback objects in it are not JSON serializable, so only a
+    short textual summary is kept.
+    """
+    if not exception:
+        return None
+    exc_type, exc_value = exception[0], exception[1]
+    if exc_type is None:
+        # opt(exception=True) outside an ``except`` block yields (None, None, None).
+        return None
+    return f"{exc_type.__name__}: {exc_value}"
+
+
+def extract_log_data(record):
+    """Build the JSON-serializable subset of a loguru *record*.
+
+    The result holds ``level``, ``time`` (ISO 8601, second precision), ``loc``
+    (``file:function:line``), any truthy extras listed in ``EXTRA_KEYS``,
+    ``message`` and, when an exception is attached, ``exception``.
+    """
+    subset = {
+        "level": record["level"].name,
+        "time": record["time"].isoformat(timespec="seconds"),
+        "loc": f"{record['file'].name}:{record['function']}:{record['line']}",
+    }
+
+    extra = record.get("extra", {})
+    for extra_key in EXTRA_KEYS:
+        if extra_val := extra.get(extra_key):
+            subset[extra_key] = extra_val
+
+    subset["message"] = record["message"]
+    if (exception := _describe_exception(record.get("exception"))) is not None:
+        subset["exception"] = exception
+    return subset
+
+
+def serialize_no_message(record):
+    """Return the record's structured data as JSON, without ``message``."""
+    subset = extract_log_data(record)
+    subset.pop("message", None)
+    return _dumps(subset)
+
+
+def serialize(record):
+    """Return the record's structured data, including ``message``, as JSON."""
+    return _dumps(extract_log_data(record))
+
+
+def patching(record):
+    """loguru patcher: store both JSON renderings in ``record["extra"]``.
+
+    Sets ``extra["serialized"]`` and ``extra["serialize_no_message"]``.
+    """
+    subset = extract_log_data(record)  # extracted once, encoded twice
+    record["extra"]["serialized"] = _dumps(subset)
+    subset.pop("message", None)
+    record["extra"]["serialize_no_message"] = _dumps(subset)
+
+
+logger = logger.patch(patching)
diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py
index 27a1bc9d..4c872f3 100644
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -7,7 +7,7 @@ from datetime import datetime, timezone
from dateutil.parser import parse as parse_dt
import requests
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
def mkdir_if_not_exists(folder):
diff --git a/tests/conftest.py b/tests/conftest.py
index a54f01d..6f47a46 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory
diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py
index ade26e4..6a54b40 100644
--- a/tests/data/test_modules/example_extractor/example_extractor.py
+++ b/tests/data/test_modules/example_extractor/example_extractor.py
@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
class ExampleExtractor(Extractor):
diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py
index 898df96..655afec 100644
--- a/tests/data/test_modules/example_module/example_module.py
+++ b/tests/data/test_modules/example_module/example_module.py
@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
-from loguru import logger
+from auto_archiver.utils.custom_logger import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
diff --git a/tests/test_implementation.py b/tests/test_implementation.py
index e52a8d8..69dd5e6 100644
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path):
def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]):
def cleanup():
- from loguru import logger
+ from auto_archiver.utils.custom_logger import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0