WIP refactor logging

This commit is contained in:
msramalho
2025-06-21 15:54:51 +01:00
parent ade7feb5a0
commit ce4d7ac649
54 changed files with 298 additions and 207 deletions

View File

@@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run
import os import os
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from loguru import logger from auto_archiver.utils.custom_logger import logger
# Create a # Create a

View File

@@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger from auto_archiver.utils.custom_logger import logger
if TYPE_CHECKING: if TYPE_CHECKING:
from .module import ModuleFactory from .module import ModuleFactory

View File

@@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap
import json import json
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from copy import deepcopy from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES from auto_archiver.core.consts import MODULE_TYPES
@@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
""" """
Override of error to format a nicer looking error message using logger Override of error to format a nicer looking error message using logger
""" """
logger.error("Problem with configuration file (tip: use --help to see the available options):") logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}")
logger.error(message)
self.exit(2) self.exit(2)
def parse_known_args(self, args=None, namespace=None): def parse_known_args(self, args=None, namespace=None):
@@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
try: try:
self._check_value(action, action.default) self._check_value(action, action.default)
except argparse.ArgumentError as e: except argparse.ArgumentError as e:
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):") logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}")
logger.error(e)
exit() exit()
return super().parse_known_args(args, namespace) return super().parse_known_args(args, namespace)

View File

@@ -12,7 +12,7 @@ from contextlib import suppress
import mimetypes import mimetypes
import os import os
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from retrying import retry from retrying import retry
import re import re
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
to_filename = to_filename[-64:] to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename) to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: if verbose:
logger.debug(f"downloading {url[0:50]=} {to_filename=}") logger.debug(f"downloading {to_filename=}")
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
} }
@@ -117,7 +117,7 @@ class Extractor(BaseModule):
return to_filename return to_filename
except requests.RequestException as e: except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") logger.warning(f"Failed to fetch the Media URL: {e}")
if try_best_quality: if try_best_quality:
return None, url return None, url

View File

@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config from dataclasses_json import dataclass_json, config
import mimetypes import mimetypes
from loguru import logger from auto_archiver.utils.custom_logger import logger
@dataclass_json # annotation order matters @dataclass_json # annotation order matters
@@ -121,8 +121,7 @@ class Media:
except Error: except Error:
return False # ffmpeg errors when reading bad files return False # ffmpeg errors when reading bad files
except Exception as e: except Exception as e:
logger.error(e) logger.error(f"{e}: {traceback.format_exc()}")
logger.error(traceback.format_exc())
try: try:
fsize = os.path.getsize(self.filename) fsize = os.path.getsize(self.filename)
return fsize > 20_000 return fsize > 20_000

View File

@@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json
import datetime import datetime
from urllib.parse import urlparse from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
from loguru import logger from auto_archiver.utils.custom_logger import logger
from .media import Media from .media import Media

View File

@@ -16,7 +16,7 @@ import sys
from importlib.util import find_spec from importlib.util import find_spec
import os import os
from os.path import join from os.path import join
from loguru import logger from auto_archiver.utils.custom_logger import logger
import auto_archiver import auto_archiver
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError

View File

@@ -15,9 +15,11 @@ import traceback
from copy import copy from copy import copy
from rich_argparse import RichHelpFormatter from rich_argparse import RichHelpFormatter
from loguru import logger from auto_archiver.utils.custom_logger import logger
import requests import requests
from auto_archiver.utils.misc import random_str
from .metadata import Metadata, Media from .metadata import Metadata, Media
from auto_archiver.version import __version__ from auto_archiver.version import __version__
from .config import ( from .config import (
@@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
# add other logging info # add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
use_level = logging_config["level"] use_level = logging_config["level"]
self.logger_id = logger.add(sys.stderr, level=use_level) self.logger_id = logger.add(
sys.stderr,
level=use_level,
catch=True,
format="<level>{level}</level>: <fg #64FFDA>{message}</fg #64FFDA> {extra[serialize_no_message]}",
)
rotation = logging_config["rotation"] rotation = logging_config["rotation"]
log_file = logging_config["file"] log_file = logging_config["file"]
@@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
f"{log_file}.{i}_{level.lower()}", f"{log_file}.{i}_{level.lower()}",
filter=lambda rec, lvl=level: rec["level"].name == lvl, filter=lambda rec, lvl=level: rec["level"].name == lvl,
rotation=rotation, rotation=rotation,
format="{extra[serialized]}",
) )
elif log_file: elif log_file:
logger.add(log_file, rotation=rotation, level=use_level) logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}")
def install_modules(self, modules_by_type): def install_modules(self, modules_by_type):
""" """
@@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
update_cmd = "`docker pull bellingcat/auto-archiver:latest`" update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
else: else:
update_cmd = "`pip install --upgrade auto-archiver`" update_cmd = "`pip install --upgrade auto-archiver`"
logger.warning("")
logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
logger.warning( logger.warning(
f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})" f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n"
) )
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
logger.warning("")
def setup(self, args: list): def setup(self, args: list):
""" """
@@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.setup(args) self.setup(args)
return self.feed() return self.feed()
except Exception as e: except Exception as e:
logger.error(e) logger.error(f"{e}: {traceback.format_exc()}")
exit(1) exit(1)
def cleanup(self) -> None: def cleanup(self) -> None:
@@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
url_count = 0 url_count = 0
for feeder in self.feeders: for feeder in self.feeders:
for item in feeder: for item in feeder:
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
logger.info("started processing")
yield self.feed_item(item) yield self.feed_item(item)
url_count += 1 url_count += 1
logger.info(f"Processed {url_count} URL(s)") logger.info(f"processed {url_count} URL(s)")
self.cleanup() self.cleanup()
def feed_item(self, item: Metadata) -> Metadata: def feed_item(self, item: Metadata) -> Metadata:
@@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
return self.archive(item) return self.archive(item)
except KeyboardInterrupt: except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit # catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}") logger.warning("caught interrupt")
for d in self.databases: for d in self.databases:
d.aborted(item) d.aborted(item)
self.cleanup() self.cleanup()
exit() exit()
except Exception as e: except Exception as e:
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}") logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}")
for d in self.databases: for d in self.databases:
if isinstance(e, AssertionError): if isinstance(e, AssertionError):
d.failed(item, str(e)) d.failed(item, str(e))
@@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try: try:
check_url_or_raise(original_url) check_url_or_raise(original_url)
except ValueError as e: except ValueError as e:
logger.error(f"Error archiving URL {original_url}: {e}") logger.error(f"Error archiving: {e}")
raise e raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
@@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
result.set_url(url) result.set_url(url)
if original_url != url: if original_url != url:
logger.debug(f"Sanitized URL from {original_url} to {url}") logger.debug(f"Sanitized URL to {url}")
result.set("original_url", original_url) result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs # 2 - notify start to DBs, propagate already archived if feature enabled in DBs
@@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try: try:
d.done(cached_result, cached=True) d.done(cached_result, cached=True)
except Exception as e: except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return cached_result return cached_result
# 3 - call extractors until one succeeds # 3 - call extractors until one succeeds
for a in self.extractors: for a in self.extractors:
logger.info(f"Trying extractor {a.name} for {url}") logger.info(f"trying extractor {a.name}")
try: try:
result.merge(a.download(result)) result.merge(a.download(result))
if result.is_success(): if result.is_success():
break break
except Exception as e: except Exception as e:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content # 4 - call enrichers to work with archived content
for e in self.enrichers: for e in self.enrichers:
try: try:
e.enrich(result) e.enrich(result)
except Exception as exc: except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media # 5 - store all downloaded/generated media
result.store(storages=self.storages) result.store(storages=self.storages)
@@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try: try:
d.done(result) d.done(result)
except Exception as e: except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return result return result

View File

@@ -24,7 +24,7 @@ from abc import abstractmethod
from typing import IO from typing import IO
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from slugify import slugify from slugify import slugify
from auto_archiver.utils.misc import random_str from auto_archiver.utils.misc import random_str

View File

@@ -7,7 +7,7 @@ from urllib.parse import urljoin
import glob import glob
import importlib.util import importlib.util
from loguru import logger from auto_archiver.utils.custom_logger import logger
import selenium import selenium
from seleniumbase import SB from seleniumbase import SB
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
continue # Skip imported modules/classes/functions continue # Skip imported modules/classes/functions
if isinstance(obj, type) and issubclass(obj, Dropin): if isinstance(obj, type) and issubclass(obj, Dropin):
dropins.append(obj) dropins.append(obj)
logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
return dropins return dropins
def sanitize_url(self, url: str) -> str: def sanitize_url(self, url: str) -> str:
@@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
using_user_data_dir = self.user_data_dir if custom_data_dir else None using_user_data_dir = self.user_data_dir if custom_data_dir else None
url = to_enrich.get_url() url = to_enrich.get_url()
url_sample = url[:75]
try: try:
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb: with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...") logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
sb.uc_open_with_reconnect(url, 4) sb.uc_open_with_reconnect(url, 4)
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") logger.debug("handling CAPTCHAs for...")
sb.uc_gui_handle_cf() sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
@@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin.open_page(url) dropin.open_page(url)
if self.detect_auth_wall and self._hit_auth_wall(sb): if self.detect_auth_wall and self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") logger.warning("skipping since auth wall or CAPTCHA was detected")
return False return False
sb.wait_for_ready_state_complete() sb.wait_for_ready_state_complete()
@@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
js_css_selector=dropin.js_for_video_css_selectors(), js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos, max_media=self.max_download_videos - downloaded_videos,
) )
logger.info(f"ANTIBOT completed for {url_sample}") logger.info("completed")
return to_enrich return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e: except selenium.common.exceptions.SessionNotCreatedException as e:
if custom_data_dir: # the retry logic only works once if custom_data_dir: # the retry logic only works once
logger.error( logger.error(
f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
) )
return self.enrich(to_enrich, custom_data_dir=False) return self.enrich(to_enrich, custom_data_dir=False)
raise e # re-raise raise e # re-raise
except Exception as e: except Exception as e:
logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}") logger.error(f"runtime error: {e}: {traceback.format_exc()}")
return False return False
def _get_suitable_dropin(self, url: str, sb: SB): def _get_suitable_dropin(self, url: str, sb: SB):
@@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
""" """
for dropin in self.dropins: for dropin in self.dropins:
if dropin.suitable(url): if dropin.suitable(url):
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}") logger.debug(f"using drop-in {dropin.__name__}")
return dropin(sb, self) return dropin(sb, self)
return DefaultDropin(sb, self) return DefaultDropin(sb, self)
@@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.") logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y) sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# js_for_css_selectors # js_for_css_selectors
for src in sources: for src in sources:
if len(all_urls) >= max_media: if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.") logger.debug(f"reached max download limit of {max_media} images/videos.")
break break
if not is_relevant_url(src): if not is_relevant_url(src):
continue continue

View File

@@ -0,0 +1,60 @@
# def solve_captcha(image_url):
# # Download image
# img_data = requests.get(image_url).content
# encoded_image = base64.b64encode(img_data).decode()
# # Submit to AntiCaptcha
# task = {
# "clientKey": ANTI_CAPTCHA_KEY,
# "task": {
# "type": "ImageToTextTask",
# "body": encoded_image
# }
# }
# print("[*] Sending captcha request to anti-captcha...")
# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json()
# task_id = task_response["taskId"]
# print(f"[*] Anti-captcha response: {task_response}")
# # Poll for result
# while True:
# time.sleep(5)
# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={
# "clientKey": ANTI_CAPTCHA_KEY,
# "taskId": task_id
# }).json()
# if res["status"] == "ready":
# print(f"[*] Captcha solved: {res}")
# return res["solution"]["text"]
# print(f"[*] Polling for captcha solution: {res['status']}")
# def solve_recaptcha(site_key, page_url):
# print("[*] Sending captcha request to anti-captcha...")
# # Step 1: Send captcha request
# task_payload = {
# "clientKey": ANTI_CAPTCHA_KEY,
# "task": {
# "type": "NoCaptchaTaskProxyless",
# "websiteURL": page_url,
# "websiteKey": site_key
# }
# }
# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json()
# print(f"[*] Anti-captcha response: {response}")
# task_id = response["taskId"]
# # Step 2: Poll for solution
# print("[*] Polling for captcha solution...")
# for i in range(40): # ~80 seconds
# time.sleep(2)
# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={
# "clientKey": ANTI_CAPTCHA_KEY,
# "taskId": task_id
# }).json()
# print(f" Poll {i+1}: status={result['status']}")
# if result["status"] == "ready":
# print("[*] Captcha solved!")
# return result["solution"]["gRecaptchaResponse"]
# raise TimeoutError("AntiCaptcha took too long")

View File

@@ -1,6 +1,7 @@
import os import os
import traceback
from typing import Mapping from typing import Mapping
from loguru import logger from auto_archiver.utils.custom_logger import logger
from seleniumbase import SB from seleniumbase import SB
import yt_dlp import yt_dlp
@@ -143,7 +144,7 @@ class Dropin:
with yt_dlp.YoutubeDL(validated_options) as ydl: with yt_dlp.YoutubeDL(validated_options) as ydl:
for url in video_urls: for url in video_urls:
try: try:
logger.debug(f"Downloading video from URL: {url}") logger.debug("downloading video from url")
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl_entry_to_filename(ydl, info) filename = ydl_entry_to_filename(ydl, info)
if not filename: # Failed to download video. if not filename: # Failed to download video.
@@ -155,5 +156,5 @@ class Dropin:
to_enrich.add_media(media) to_enrich.add_media(media)
downloaded += 1 downloaded += 1
except Exception as e: except Exception as e:
logger.error(f"Error downloading {url}: {e}") logger.error(f"download failed: {e} {traceback.format_exc()}")
return downloaded return downloaded

View File

@@ -1,5 +1,5 @@
from typing import Mapping from typing import Mapping
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
username, password = self._get_username_password("linkedin.com") username, password = self._get_username_password("linkedin.com")
logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username) logger.debug("logging in to Linkedin with username: {}", username)
self.sb.type("#username", username) self.sb.type("#username", username)
self.sb.type("#password", password) self.sb.type("#password", password)
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)

View File

@@ -3,7 +3,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from loguru import logger from auto_archiver.utils.custom_logger import logger
class RedditDropin(Dropin): class RedditDropin(Dropin):
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
self._close_cookies_banner() self._close_cookies_banner()
username, password = self._get_username_password("reddit.com") username, password = self._get_username_password("reddit.com")
logger.debug("RedditDropin Logging in to Reddit with username: {}", username) logger.debug("logging in to Reddit with username: {}", username)
self.sb.type("#login-username", username) self.sb.type("#login-username", username)
self.sb.type("#login-password", password) self.sb.type("#login-password", password)
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
self.sb.click_link_text("Log in") self.sb.click_link_text("Log in")
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
if self.sb.is_text_visible("Welcome back"): if self.sb.is_text_visible("Welcome back"):
logger.debug("RedditDropin Login successful") logger.debug("login successful")
self.sb.click_if_visible("this link") self.sb.click_if_visible("this link")
def _close_cookies_banner(self): def _close_cookies_banner(self):
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
.map(el => el.src || el.href) .map(el => el.src || el.href)
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
""") """)
logger.debug("RedditDropin Found {} video URLs", len(filtered_urls)) logger.debug("found {} video URLs", len(filtered_urls))
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)

View File

@@ -4,7 +4,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from loguru import logger from auto_archiver.utils.custom_logger import logger
class VkDropin(Dropin): class VkDropin(Dropin):
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
self.sb.open("https://vk.com") self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url(): if "/feed" in self.sb.get_current_url():
logger.debug("Already logged in to VK.") logger.debug("already logged in to VK.")
return True return True
# need to login # need to login
username, password = self._get_username_password("vk.com") username, password = self._get_username_password("vk.com")
logger.debug("Logging in to VK with username: {}", username) logger.debug("logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)

View File

@@ -2,7 +2,7 @@ from typing import Union
import os import os
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database from auto_archiver.core import Database
from auto_archiver.core import Metadata from auto_archiver.core import Metadata
@@ -36,9 +36,9 @@ class AAApiDb(Database):
if not self.store_results: if not self.store_results:
return return
if cached: if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") logger.debug("skipping saving archive to AA API because it was cached")
return return
logger.debug(f"saving archive of {item.get_url()} to the AA API.") logger.debug("saving archive to the AA API.")
payload = { payload = {
"author_id": self.author_id, "author_id": self.author_id,

View File

@@ -3,7 +3,7 @@ import os
from typing import IO, Iterator, Optional, Union from typing import IO, Iterator, Optional, Union
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
from auto_archiver.utils import calculate_file_hash from auto_archiver.utils import calculate_file_hash
@@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as failed in Atlos, if the ID exists.""" """Mark an item as failed in Atlos, if the ID exists."""
atlos_id = item.metadata.get("atlos_id") atlos_id = item.metadata.get("atlos_id")
if not atlos_id: if not atlos_id:
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") logger.info("No Atlos ID available, skipping")
return return
self._post( self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
json={"metadata": {"processed": True, "status": "error", "error": reason}}, json={"metadata": {"processed": True, "status": "error", "error": reason}},
) )
logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}") logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
def fetch(self, item: Metadata) -> Union[Metadata, bool]: def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check and fetch if the given item has been archived already, each """check and fetch if the given item has been archived already, each
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as successfully archived in Atlos.""" """Mark an item as successfully archived in Atlos."""
atlos_id = item.metadata.get("atlos_id") atlos_id = item.metadata.get("atlos_id")
if not atlos_id: if not atlos_id:
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") logger.info("item has no Atlos ID, skipping")
return return
self._post( self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
} }
}, },
) )
logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos") logger.info(f"stored success ID {atlos_id} on Atlos")
# ! Atlos Module - Storage Methods # ! Atlos Module - Storage Methods
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
"""Upload a media file to Atlos if it has not been uploaded already.""" """Upload a media file to Atlos if it has not been uploaded already."""
if metadata is None: if metadata is None:
logger.error(f"No metadata provided for {media.filename}") logger.error(f"no metadata provided for {media.filename}")
return False return False
atlos_id = metadata.get("atlos_id") atlos_id = metadata.get("atlos_id")
if not atlos_id: if not atlos_id:
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.") logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
return False return False
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
params={"title": media.properties}, params={"title": media.properties},
files={"file": (os.path.basename(media.filename), file_obj)}, files={"file": (os.path.basename(media.filename), file_obj)},
) )
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
return True return True
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:

View File

@@ -1,5 +1,3 @@
from loguru import logger
from auto_archiver.core.feeder import Feeder from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.core.consts import SetupError from auto_archiver.core.consts import SetupError
@@ -16,8 +14,5 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata: def __iter__(self) -> Metadata:
urls = self.config["urls"] urls = self.config["urls"]
for url in urls: for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url) m = Metadata().set_url(url)
yield m yield m
logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -1,4 +1,4 @@
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database from auto_archiver.core import Database
from auto_archiver.core import Metadata from auto_archiver.core import Metadata

View File

@@ -1,5 +1,5 @@
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from csv import DictWriter from csv import DictWriter
from dataclasses import asdict from dataclasses import asdict

View File

@@ -1,4 +1,4 @@
from loguru import logger from auto_archiver.utils.custom_logger import logger
import csv import csv
from auto_archiver.core import Feeder from auto_archiver.core import Feeder
@@ -20,20 +20,19 @@ class CSVFeeder(Feeder):
url_column = first_row.index(url_column) url_column = first_row.index(url_column)
except ValueError: except ValueError:
logger.error( logger.error(
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
) )
return return
elif not (url_or_none(first_row[url_column])): elif not (url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already # it's a header row, but we've been given a column number already
logger.debug(f"Skipping header row: {first_row}") logger.debug(f"skipping header row: {first_row}")
else: else:
# first row isn't a header row, rewind the file # first row isn't a header row, rewind the file
f.seek(0) f.seek(0)
for row in reader: for row in reader:
if not url_or_none(row[url_column]): if not url_or_none(row[url_column]):
logger.warning(f"Not a valid URL in row: {row}, skipping") logger.warning(f"not a valid URL in row: {row}, skipping")
continue continue
url = row[url_column] url = row[url_column]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url) yield Metadata().set_url(url)

View File

@@ -8,7 +8,7 @@ from google.oauth2 import service_account
from google.oauth2.credentials import Credentials from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload from googleapiclient.http import MediaFileUpload
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media from auto_archiver.core import Media
from auto_archiver.core import Storage from auto_archiver.core import Storage
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
def _setup_google_drive_service(self): def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials.""" """Initialize Google Drive service based on provided credentials."""
if self.oauth_token: if self.oauth_token:
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}") logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token() self.service = self._initialize_with_oauth_token()
elif self.service_account: elif self.service_account:
logger.debug(f"Using Google Drive service account: {self.service_account}") logger.debug(f"using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account() self.service = self._initialize_with_service_account()
else: else:
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
if not creds.valid and creds.expired and creds.refresh_token: if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request()) creds.refresh(Request())
with open(self.oauth_token, "w") as token_file: with open(self.oauth_token, "w") as token_file:
logger.debug("Saving refreshed OAuth token.") logger.debug("saving refreshed OAuth token.")
token_file.write(creds.to_json()) token_file.write(creds.to_json())
elif not creds.valid: elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.") raise ValueError("Invalid OAuth token. Please regenerate the token.")
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
Creates a new GDrive folder @name inside folder @parent_id Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder Returns id of the created folder
""" """
logger.debug(f"Creating new folder with {name=} inside {parent_id=}") logger.debug(f"creating new folder with {name=} inside {parent_id=}")
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]} file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute() gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
return gd_folder.get("id") return gd_folder.get("id")

View File

@@ -1,4 +1,4 @@
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.metadata import Metadata, Media
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
# download if embeds present (1 video XOR >=1 images) # download if embeds present (1 video XOR >=1 images)
for media in self._download_bsky_embeds(post, archiver): for media in self._download_bsky_embeds(post, archiver):
result.add_media(media) result.add_media(media)
logger.debug(f"Downloaded {len(result.media)} media files") logger.debug(f"downloaded {len(result.media)} media files")
return result return result

View File

@@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import MaxDownloadsReached from yt_dlp.utils import MaxDownloadsReached
import pysubs2 import pysubs2
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
@@ -63,12 +63,11 @@ class GenericExtractor(Extractor):
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1": if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually") logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
else: else:
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver") logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ")
logger.warning(" ======= RESTARTING ======= ")
os.execv(sys.executable, [sys.executable] + sys.argv) os.execv(sys.executable, [sys.executable] + sys.argv)
def update_package(self, package_name: str) -> bool: def update_package(self, package_name: str) -> bool:
logger.info(f"Checking and updating {package_name}...") logger.info(f"checking and updating {package_name}...")
from importlib.metadata import version as get_version from importlib.metadata import version as get_version
old_version = get_version(package_name) old_version = get_version(package_name)
@@ -80,7 +79,7 @@ class GenericExtractor(Extractor):
return True return True
logger.info(f"{package_name} already up to date") logger.info(f"{package_name} already up to date")
except Exception as e: except Exception as e:
logger.error(f"Error updating {package_name}: {e}") logger.error(f"failed to update {package_name}: {e}")
return False return False
def setup_po_tokens(self) -> None: def setup_po_tokens(self) -> None:
@@ -111,7 +110,7 @@ class GenericExtractor(Extractor):
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None] missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
if missing_tools: if missing_tools:
logger.error( logger.error(
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
"Install these tools or run bgutils via Docker. " "Install these tools or run bgutils via Docker. "
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider" "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
) )
@@ -140,7 +139,7 @@ class GenericExtractor(Extractor):
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip" f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
) )
zip_path = os.path.join(base_dir, f"{plugin_version}.zip") zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
logger.info(f"Downloading bgutils release zip for version {plugin_version}...") logger.info(f"downloading bgutils release zip for version {plugin_version}...")
urlretrieve(zip_url, zip_path) urlretrieve(zip_url, zip_path)
with zipfile.ZipFile(zip_path, "r") as z: with zipfile.ZipFile(zip_path, "r") as z:
z.extractall(base_dir) z.extractall(base_dir)
@@ -149,7 +148,7 @@ class GenericExtractor(Extractor):
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}") extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
shutil.move(os.path.join(extracted_root, "server"), server_dir) shutil.move(os.path.join(extracted_root, "server"), server_dir)
shutil.rmtree(extracted_root) shutil.rmtree(extracted_root)
logger.info("Installing dependencies and transpiling PoT Generator script...") logger.info("installing dependencies and transpiling PoT Generator script...")
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True) subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True) subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
@@ -165,7 +164,7 @@ class GenericExtractor(Extractor):
logger.info(f"PO Token script configured at: {script_path}") logger.info(f"PO Token script configured at: {script_path}")
except Exception as e: except Exception as e:
logger.error(f"Failed to set up PO Token script: {e}") logger.error(f"failed to set up PO Token script: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]: def suitable_extractors(self, url: str) -> Generator[str, None, None]:
""" """
@@ -206,7 +205,7 @@ class GenericExtractor(Extractor):
media = Media(cover_image_path) media = Media(cover_image_path)
metadata.add_media(media, id="cover") metadata.add_media(media, id="cover")
except Exception as e: except Exception as e:
logger.error(f"Error downloading cover image {thumbnail_url}: {e}") logger.error(f"could not download cover image {thumbnail_url}: {e}")
dropin = self.dropin_for_name(info_extractor.ie_key()) dropin = self.dropin_for_name(info_extractor.ie_key())
if dropin: if dropin:
@@ -353,7 +352,7 @@ class GenericExtractor(Extractor):
if not dropin: if not dropin:
# TODO: add a proper link to 'how to create your own dropin' # TODO: add a proper link to 'how to create your own dropin'
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
return False return False
@@ -389,7 +388,7 @@ class GenericExtractor(Extractor):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue continue
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}") logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename) new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -404,12 +403,12 @@ class GenericExtractor(Extractor):
text = " ".join([line.text for line in subs]) text = " ".join([line.text for line in subs])
new_media.set(f"subtitles_{lang}", text) new_media.set(f"subtitles_{lang}", text)
except Exception as e: except Exception as e:
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}") logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
result.add_media(new_media) result.add_media(new_media)
except Exception as e: except Exception as e:
logger.error(f"Error processing entry {entry}: {e}") logger.error(f"error processing entry {entry}: {e}")
if not len(result.media): if not len(result.media):
logger.info(f"No media found for entry {entry}, skipping.") logger.info(f"no media found for entry {entry}, skipping.")
return False return False
return self.add_metadata(data, info_extractor, url, result) return self.add_metadata(data, info_extractor, url, result)
@@ -471,14 +470,14 @@ class GenericExtractor(Extractor):
def _helper_for_successful_extract_info(data, info_extractor, url, ydl): def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
if data.get("is_live", False) and not self.livestreams: if data.get("is_live", False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting") logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
return False return False
# it's a valid video, that the youtubdedl can download out of the box # it's a valid video, that the youtubdedl can download out of the box
return self.get_metadata_for_video(data, info_extractor, url, ydl) return self.get_metadata_for_video(data, info_extractor, url, ydl)
try: try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor): if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}") logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp() raise SkipYtdlp()
# don't download since it can be a live stream # don't download since it can be a live stream
@@ -497,17 +496,17 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp): if not isinstance(e, SkipYtdlp):
logger.debug( logger.debug(
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
) )
try: try:
result = self.get_metadata_for_post(info_extractor, url, ydl) result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
logger.error("Error downloading metadata for post: {error}", error=str(post_e)) logger.error("error downloading metadata for post: {error}", error=str(post_e))
return False return False
except Exception as generic_e: except Exception as generic_e:
logger.debug( logger.debug(
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}', 'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
name=info_extractor.IE_NAME, name=info_extractor.IE_NAME,
error=str(generic_e), error=str(generic_e),
exc_info=True, exc_info=True,
@@ -560,17 +559,17 @@ class GenericExtractor(Extractor):
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth: if auth:
if "username" in auth and "password" in auth: if "username" in auth and "password" in auth:
logger.debug(f"Using provided auth username and password for {url}") logger.debug("using provided auth username and password")
ydl_options.extend(("--username", auth["username"])) ydl_options.extend(("--username", auth["username"]))
ydl_options.extend(("--password", auth["password"])) ydl_options.extend(("--password", auth["password"]))
elif "cookie" in auth: elif "cookie" in auth:
logger.debug(f"Using provided auth cookie for {url}") logger.debug("using provided auth cookie")
yt_dlp.utils.std_headers["cookie"] = auth["cookie"] yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
elif "cookies_from_browser" in auth: elif "cookies_from_browser" in auth:
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}") logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
elif "cookies_file" in auth: elif "cookies_file" in auth:
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}") logger.debug(f"using cookies from file {auth['cookies_file']}")
ydl_options.extend(("--cookies", auth["cookies_file"])) ydl_options.extend(("--cookies", auth["cookies_file"]))
# Applying user-defined extractor_args # Applying user-defined extractor_args
@@ -580,11 +579,11 @@ class GenericExtractor(Extractor):
arg_str = ";".join(f"{k}={v}" for k, v in args.items()) arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else: else:
arg_str = str(args) arg_str = str(args)
logger.debug(f"Setting extractor_args: {key}:{arg_str}") logger.debug(f"setting extractor_args: {key}:{arg_str}")
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"]) ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args: if self.ytdlp_args:
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}") logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
ydl_options += self.ytdlp_args.split(" ") ydl_options += self.ytdlp_args.split(" ")
*_, validated_options = yt_dlp.parse_options(ydl_options) *_, validated_options = yt_dlp.parse_options(ydl_options)

View File

@@ -1,5 +1,5 @@
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
def extract_post(self, url: str, ie_instance): def extract_post(self, url: str, ie_instance):
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}") logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
endpoint = self.TIKWM_ENDPOINT.format(url=url) endpoint = self.TIKWM_ENDPOINT.format(url=url)

View File

@@ -1,7 +1,7 @@
import re import re
import mimetypes import mimetypes
from loguru import logger from auto_archiver.utils.custom_logger import logger
from slugify import slugify from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.metadata import Metadata, Media
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
raise ValueError("Error retreiving post. Are you sure it exists?") raise ValueError("Error retreiving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex: except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False return False
full_text = tweet.pop("full_text", "") full_text = tweet.pop("full_text", "")
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"): if not tweet.get("entities", {}).get("media"):
logger.debug("No media found, archiving tweet text only") logger.debug("no media found, archiving tweet text only")
result.status = "twitter-ytdl" result.status = "twitter-ytdl"
return result return result
for i, tw_media in enumerate(tweet["entities"]["media"]): for i, tw_media in enumerate(tweet["entities"]["media"]):

View File

@@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects.
""" """
import os import os
import traceback
from typing import Tuple, Union, Iterator from typing import Tuple, Union, Iterator
from urllib.parse import quote from urllib.parse import quote
import gspread import gspread
from loguru import logger from auto_archiver.utils.custom_logger import logger
from slugify import slugify from slugify import slugify
from retrying import retry from retrying import retry
@@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
sh = self.open_sheet() sh = self.open_sheet()
for ii, worksheet in enumerate(sh.worksheets()): for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title): if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
continue continue
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}") logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)): if len(missing_cols := self.missing_required_columns(gw)):
logger.debug( logger.debug(
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
) )
continue continue
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
# process and yield metadata here: # process and yield metadata here:
yield from self._process_rows(gw) yield from self._process_rows(gw)
logger.info(f"Finished worksheet {worksheet.title}") logger.info(f"finished worksheet {worksheet.title}")
def _process_rows(self, gw: GWorksheet): def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1): for row in range(1 + self.header, gw.count_rows() + 1):
@@ -69,6 +70,8 @@ class GsheetsFeederDB(Feeder, Database):
# All checks done - archival process starts here # All checks done - archival process starts here
m = Metadata().set_url(url) m = Metadata().set_url(url)
self._set_context(m, gw, row) self._set_context(m, gw, row)
with logger.contextualize(row=row):
yield m yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
@@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database):
return missing return missing
def started(self, item: Metadata) -> None: def started(self, item: Metadata) -> None:
logger.info(f"STARTED {item}") logger.info("STARTED")
gw, row = self._retrieve_gsheet(item) gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", "Archive in progress") gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason: str) -> None: def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}") logger.error("FAILED")
self._safe_status_update(item, f"Archive failed {reason}") self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None: def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}") logger.warning("ABORTED")
self._safe_status_update(item, "") self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]: def fetch(self, item: Metadata) -> Union[Metadata, bool]:
@@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database):
cell_updates = [] cell_updates = []
row_values = gw.get_row(row) row_values = gw.get_row(row)
spreadsheet = gw.wks.spreadsheet.title logger.info("DONE")
worksheet = gw.wks.title
logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}")
def batch_if_valid(col, val, final_value=None): def batch_if_valid(col, val, final_value=None):
final_value = final_value or val final_value = final_value or val
@@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value)) cell_updates.append((row, col, final_value))
except Exception as e: except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}") logger.error(f"unable to batch {col}={final_value} due to {e}")
status_message = item.status status_message = item.status
if cached: if cached:
@@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
gw, row = self._retrieve_gsheet(item) gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", new_status) gw.set_cell(row, "status", new_status)
except Exception as e: except Exception as e:
logger.debug(f"Unable to update sheet: {e}") logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"): if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet") gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row") row: int = gsheet.get("row")
elif self.sheet_id: elif self.sheet_id:
logger.error( logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
)
return gw, row return gw, row

View File

@@ -9,7 +9,7 @@ making it suitable for handling large files efficiently.
""" """
import hashlib import hashlib
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata from auto_archiver.core import Metadata
@@ -22,8 +22,7 @@ class HashEnricher(Enricher):
""" """
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug(f"calculating media hashes with algo={self.algorithm}")
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
if len(hd := self.calculate_hash(m.filename)): if len(hd := self.calculate_hash(m.filename)):

View File

@@ -4,7 +4,7 @@ import os
import pathlib import pathlib
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote from urllib.parse import quote
from loguru import logger from auto_archiver.utils.custom_logger import logger
import json import json
import base64 import base64
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
def format(self, item: Metadata) -> Media: def format(self, item: Metadata) -> Media:
url = item.get_url() url = item.get_url()
if item.is_empty(): if item.is_empty():
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}") logger.debug("nothing to format, skipping")
return return
content = self.template.render( content = self.template.render(

View File

@@ -14,7 +14,7 @@ from datetime import datetime
import traceback import traceback
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from retrying import retry from retrying import retry
from tqdm import tqdm from tqdm import tqdm
@@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor):
url = item.get_url() url = item.get_url()
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
insta_matches = self.valid_url.findall(url) insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3: if not len(insta_matches) or len(insta_matches[0]) != 3:
return return
if len(insta_matches) > 1: if len(insta_matches) > 1:
logger.warning(f"Multiple instagram matches found in {url=}, using the first one") logger.debug("multiple instagram matches found, using the first one")
return return
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
if g1 == "": if g1 == "":
@@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor):
return self.download_post(item, id=g3, context="story") return self.download_post(item, id=g3, context="story")
return self.download_stories(item, g2) return self.download_stories(item, g2)
else: else:
logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}") logger.warning(f"unknown instagram regex group match {g1=}")
return return
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
count_posts += len(stories) count_posts += len(stories)
result.set("#stories", len(stories)) result.set("#stories", len(stories))
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading stories for {username}") result.append("errors", f"error downloading stories for {username}")
logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}") logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
# download all posts # download all posts
try: try:
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts result, user_id, max_to_download=self.full_profile_max_posts - count_posts
) )
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading posts for {username}") result.append("errors", f"error downloading posts for {username}")
logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}") logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
# download all tagged # download all tagged
try: try:
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts result, user_id, max_to_download=self.full_profile_max_posts - count_posts
) )
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading tagged posts for {username}") result.append("errors", f"error downloading tagged posts for {username}")
logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}") logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
# download all highlights # download all highlights
try: try:
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
except Exception as e: except Exception as e:
result.append( result.append(
"errors", "errors",
f"Error downloading highlight id{h.get('pk')} for {username}", f"error downloading highlight id{h.get('pk')} for {username}",
) )
logger.error( logger.error(
f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
) )
if count_highlights >= max_to_download: if count_highlights >= max_to_download:
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
@@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor):
try: try:
self.scrape_item(result, h, "highlight") self.scrape_item(result, h, "highlight")
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading highlight {h.get('id')}") result.append("errors", f"error downloading highlight {h.get('id')}")
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
return h_info return h_info
@@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor):
try: try:
self.scrape_item(result, p, "post") self.scrape_item(result, p, "post")
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading post {p.get('id')}") result.append("errors", f"error downloading post {p.get('id')}")
logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1) pbar.update(1)
post_count += 1 post_count += 1
if post_count >= max_to_download: if post_count >= max_to_download:
@@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor):
try: try:
self.scrape_item(result, p, "tagged") self.scrape_item(result, p, "tagged")
except Exception as e: except Exception as e:
result.append("errors", f"Error downloading tagged post {p.get('id')}") result.append("errors", f"error downloading tagged post {p.get('id')}")
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1) pbar.update(1)
tagged_count += 1 tagged_count += 1
if tagged_count >= max_to_download: if tagged_count >= max_to_download:

View File

@@ -8,7 +8,7 @@ import re
import os import os
import shutil import shutil
import instaloader import instaloader
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from auto_archiver.core import Metadata from auto_archiver.core import Metadata
@@ -29,8 +29,9 @@ class InstagramExtractor(Extractor):
# TODO: links to stories # TODO: links to stories
def setup(self) -> None: def setup(self) -> None:
logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.") logger.warning(
logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.") "Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead."
)
self.insta = instaloader.Instaloader( self.insta = instaloader.Instaloader(
download_geotags=True, download_geotags=True,
@@ -43,12 +44,11 @@ class InstagramExtractor(Extractor):
self.insta.load_session_from_file(self.username, self.session_file) self.insta.load_session_from_file(self.username, self.session_file)
except Exception: except Exception:
try: try:
logger.debug("Session file failed", exc_info=True) logger.info("no valid session file found - Attempting login with use and password.")
logger.info("No valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password) self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file) self.insta.save_session_to_file(self.session_file)
except Exception as e: except Exception as e:
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()
@@ -72,14 +72,14 @@ class InstagramExtractor(Extractor):
result = self.download_profile(url, profile_matches[0]) result = self.download_profile(url, profile_matches[0])
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
) )
finally: finally:
shutil.rmtree(self.download_folder, ignore_errors=True) shutil.rmtree(self.download_folder, ignore_errors=True)
return result return result
def download_post(self, url: str, post_id: str) -> Metadata: def download_post(self, url: str, post_id: str) -> Metadata:
logger.debug(f"Instagram {post_id=} detected in {url=}") logger.debug(f"Instagram {post_id=} detected")
post = instaloader.Post.from_shortcode(self.insta.context, post_id) post = instaloader.Post.from_shortcode(self.insta.context, post_id)
if self.insta.download_post(post, target=post.owner_username): if self.insta.download_post(post, target=post.owner_username):
@@ -87,7 +87,7 @@ class InstagramExtractor(Extractor):
def download_profile(self, url: str, username: str) -> Metadata: def download_profile(self, url: str, username: str) -> Metadata:
# gets posts, posts where username is tagged, igtv postss, stories, and highlights # gets posts, posts where username is tagged, igtv postss, stories, and highlights
logger.debug(f"Instagram {username=} detected in {url=}") logger.debug(f"Instagram {username=} detected")
profile = instaloader.Profile.from_username(self.insta.context, username) profile = instaloader.Profile.from_username(self.insta.context, username)
try: try:
@@ -95,27 +95,27 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_post(post, target=f"profile_post_{post.owner_username}") self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"Failed to download post: {post.shortcode}: {e}") logger.error(f"failed to download post: {post.shortcode}: {e}")
except Exception as e: except Exception as e:
logger.error(f"Failed profile.get_posts: {e}") logger.error(f"failed profile.get_posts: {e}")
try: try:
for post in profile.get_tagged_posts(): for post in profile.get_tagged_posts():
try: try:
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
except Exception as e: except Exception as e:
logger.error(f"Failed profile.get_tagged_posts: {e}") logger.error(f"failed profile.get_tagged_posts: {e}")
try: try:
for post in profile.get_igtv_posts(): for post in profile.get_igtv_posts():
try: try:
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
except Exception as e: except Exception as e:
logger.error(f"Failed profile.get_igtv_posts: {e}") logger.error(f"failed profile.get_igtv_posts: {e}")
try: try:
for story in self.insta.get_stories([profile.userid]): for story in self.insta.get_stories([profile.userid]):
@@ -123,9 +123,9 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"Failed to download story item: {item}: {e}") logger.error(f"failed to download story item: {item}: {e}")
except Exception as e: except Exception as e:
logger.error(f"Failed get_stories: {e}") logger.error(f"failed get_stories: {e}")
try: try:
for highlight in self.insta.get_highlights(profile.userid): for highlight in self.insta.get_highlights(profile.userid):
@@ -133,9 +133,9 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"Failed to download highlight item: {item}: {e}") logger.error(f"failed to download highlight item: {item}: {e}")
except Exception as e: except Exception as e:
logger.error(f"Failed get_highlights: {e}") logger.error(f"failed get_highlights: {e}")
return self.process_downloads(url, f"@{username}", profile._asdict(), None) return self.process_downloads(url, f"@{username}", profile._asdict(), None)
@@ -158,4 +158,4 @@ class InstagramExtractor(Extractor):
return result.success("instagram") return result.success("instagram")
except Exception as e: except Exception as e:
logger.error(f"Could not fetch instagram post {url} due to: {e}") logger.error(f"could not fetch instagram post due to: {e}")

View File

@@ -12,7 +12,7 @@ import shutil
import time import time
from sqlite3 import OperationalError from sqlite3 import OperationalError
from loguru import logger from auto_archiver.utils.custom_logger import logger
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from auto_archiver.core import Extractor from auto_archiver.core import Extractor

View File

@@ -1,5 +1,5 @@
import json import json
from loguru import logger from auto_archiver.utils.custom_logger import logger
import os import os
from auto_archiver.core import Enricher from auto_archiver.core import Enricher

View File

@@ -1,7 +1,7 @@
import shutil import shutil
from typing import IO from typing import IO
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media from auto_archiver.core import Media
from auto_archiver.core import Storage from auto_archiver.core import Storage

View File

@@ -1,6 +1,6 @@
import datetime import datetime
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata from auto_archiver.core import Metadata

View File

@@ -1,6 +1,6 @@
import subprocess import subprocess
import traceback import traceback
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata from auto_archiver.core import Metadata

View File

@@ -1,6 +1,6 @@
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
import opentimestamps import opentimestamps
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile

View File

@@ -15,7 +15,7 @@ import traceback
import pdqhash import pdqhash
import numpy as np import numpy as np
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata from auto_archiver.core import Metadata

View File

@@ -2,7 +2,7 @@ from typing import IO
import boto3 import boto3
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media from auto_archiver.core import Media
from auto_archiver.core import Storage from auto_archiver.core import Storage

View File

@@ -2,7 +2,7 @@ import ssl
import os import os
from slugify import slugify from slugify import slugify
from urllib.parse import urlparse from urllib.parse import urlparse
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media

View File

@@ -2,7 +2,7 @@ import requests
import re import re
import html import html
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media

View File

@@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import (
) )
from tqdm import tqdm from tqdm import tqdm
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media

View File

@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg import ffmpeg
import os import os
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata from auto_archiver.core import Media, Metadata

View File

@@ -5,7 +5,7 @@ import hashlib
from slugify import slugify from slugify import slugify
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder) from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
from rfc3161_client import VerificationError as Rfc3161VerificationError from rfc3161_client import VerificationError as Rfc3161VerificationError

View File

@@ -4,7 +4,7 @@ import re
import mimetypes import mimetypes
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
from pytwitter import Api from pytwitter import Api
from slugify import slugify from slugify import slugify

View File

@@ -4,7 +4,7 @@ import os
import shutil import shutil
import subprocess import subprocess
from zipfile import ZipFile from zipfile import ZipFile
from loguru import logger from auto_archiver.utils.custom_logger import logger
from warcio.archiveiterator import ArchiveIterator from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata from auto_archiver.core import Media, Metadata

View File

@@ -1,5 +1,5 @@
import json import json
from loguru import logger from auto_archiver.utils.custom_logger import logger
import time import time
import requests import requests

View File

@@ -1,7 +1,7 @@
import traceback import traceback
import requests import requests
import time import time
from loguru import logger from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media

View File

@@ -0,0 +1,37 @@
from loguru import logger
import json
def _exception_summary(exception):
    """Reduce a record's exception entry to JSON-serializable data."""
    if isinstance(exception, tuple) and len(exception) == 3:
        exc_type, exc_value, exc_traceback = exception
        return {
            "type": None if exc_type is None else getattr(exc_type, "__name__", str(exc_type)),
            "value": None if exc_value is None else str(exc_value),
            "traceback": bool(exc_traceback),
        }
    # Anything that is not a (type, value, traceback) triple is stringified.
    return str(exception)


def extract_log_data(record):
    """Build a compact, JSON-serializable summary of a log record.

    Args:
        record: Mapping with the keys ``level`` (object with ``.name``),
            ``time`` (object with ``.isoformat``), ``file`` (object with
            ``.name``), ``function``, ``line`` and ``message``. The keys
            ``extra`` and ``exception`` are optional.

    Returns:
        dict: ``level``, ``time`` (ISO format, second precision), ``loc``
        (``file:function:line``), any truthy ``trace``/``url``/``worksheet``/
        ``row`` values found in ``extra``, ``message`` and, when the record
        carries one, ``exception``.
    """
    subset = {
        "level": record["level"].name,
        "time": record["time"].isoformat(timespec="seconds"),
        "loc": f"{record['file'].name}:{record['function']}:{record['line']}",
    }

    extra = record.get("extra", {})
    for extra_key in ("trace", "url", "worksheet", "row"):
        # Only truthy context values are copied; missing/empty ones are skipped.
        if extra_val := extra.get(extra_key):
            subset[extra_key] = extra_val

    subset["message"] = record["message"]

    if exception := record.get("exception"):
        # The raw exception entry holds a class, an exception instance and a
        # traceback, which json.dumps cannot encode; storing it verbatim made
        # serialize()/serialize_no_message() raise TypeError for every record
        # logged with exception info.
        subset["exception"] = _exception_summary(exception)
    return subset
def serialize_no_message(record):
    """Return the extracted log data as a JSON string, without the ``message`` field."""
    fields = {key: value for key, value in extract_log_data(record).items() if key != "message"}
    return json.dumps(fields, ensure_ascii=False)
def serialize(record):
    """Return the extracted log data for ``record`` as a JSON string (non-ASCII kept as-is)."""
    payload = extract_log_data(record)
    return json.dumps(payload, ensure_ascii=False)
def patching(record):
    """Store both JSON renderings of ``record`` in its ``extra`` dict.

    Writes ``extra["serialized"]`` (full data) and
    ``extra["serialize_no_message"]`` (same data without the message).
    """
    with_message = serialize(record)
    extra = record["extra"]
    extra["serialized"] = with_message
    extra["serialize_no_message"] = serialize_no_message(record)
# Rebind the module-level `logger` to the result of `logger.patch(patching)`.
# Per loguru's `patch()` API, `patching` is then applied to each record dict;
# code importing `logger` from this module receives this patched instance.
logger = logger.patch(patching)

View File

@@ -7,7 +7,7 @@ from datetime import datetime, timezone
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
import requests import requests
from loguru import logger from auto_archiver.utils.custom_logger import logger
def mkdir_if_not_exists(folder): def mkdir_if_not_exists(folder):

View File

@@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple from typing import Dict, Tuple
import hashlib import hashlib
from loguru import logger from auto_archiver.utils.custom_logger import logger
import pytest import pytest
from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory from auto_archiver.core.module import ModuleFactory

View File

@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from loguru import logger from auto_archiver.utils.custom_logger import logger
class ExampleExtractor(Extractor): class ExampleExtractor(Extractor):

View File

@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
from loguru import logger from auto_archiver.utils.custom_logger import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):

View File

@@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path):
def autoarchiver(tmp_path, monkeypatch, request): def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]): def _autoarchiver(args=[]):
def cleanup(): def cleanup():
from loguru import logger from auto_archiver.utils.custom_logger import logger
if not logger._core.handlers.get(0): if not logger._core.handlers.get(0):
logger._core.handlers_count = 0 logger._core.handlers_count = 0