mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
WIP refactor logging
This commit is contained in:
@@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run
|
||||
|
||||
import os
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
# Create a
|
||||
|
||||
@@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .module import ModuleFactory
|
||||
|
||||
@@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap
|
||||
import json
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from copy import deepcopy
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
@@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
"""
|
||||
Override of error to format a nicer looking error message using logger
|
||||
"""
|
||||
logger.error("Problem with configuration file (tip: use --help to see the available options):")
|
||||
logger.error(message)
|
||||
logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}")
|
||||
self.exit(2)
|
||||
|
||||
def parse_known_args(self, args=None, namespace=None):
|
||||
@@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
try:
|
||||
self._check_value(action, action.default)
|
||||
except argparse.ArgumentError as e:
|
||||
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):")
|
||||
logger.error(e)
|
||||
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}")
|
||||
exit()
|
||||
|
||||
return super().parse_known_args(args, namespace)
|
||||
|
||||
@@ -12,7 +12,7 @@ from contextlib import suppress
|
||||
import mimetypes
|
||||
import os
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from retrying import retry
|
||||
import re
|
||||
|
||||
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
|
||||
to_filename = to_filename[-64:]
|
||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||
if verbose:
|
||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
logger.debug(f"downloading {to_filename=}")
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
@@ -117,7 +117,7 @@ class Extractor(BaseModule):
|
||||
return to_filename
|
||||
|
||||
except requests.RequestException as e:
|
||||
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
|
||||
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||
if try_best_quality:
|
||||
return None, url
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@@ -121,8 +121,7 @@ class Media:
|
||||
except Error:
|
||||
return False # ffmpeg errors when reading bad files
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(traceback.format_exc())
|
||||
logger.error(f"{e}: {traceback.format_exc()}")
|
||||
try:
|
||||
fsize = os.path.getsize(self.filename)
|
||||
return fsize > 20_000
|
||||
|
||||
@@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from .media import Media
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ import sys
|
||||
from importlib.util import find_spec
|
||||
import os
|
||||
from os.path import join
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import auto_archiver
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
|
||||
|
||||
|
||||
@@ -15,9 +15,11 @@ import traceback
|
||||
from copy import copy
|
||||
|
||||
from rich_argparse import RichHelpFormatter
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import requests
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import (
|
||||
@@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
use_level = logging_config["level"]
|
||||
self.logger_id = logger.add(sys.stderr, level=use_level)
|
||||
self.logger_id = logger.add(
|
||||
sys.stderr,
|
||||
level=use_level,
|
||||
catch=True,
|
||||
format="<level>{level}</level>: <fg #64FFDA>{message}</fg #64FFDA> {extra[serialize_no_message]}",
|
||||
)
|
||||
|
||||
rotation = logging_config["rotation"]
|
||||
log_file = logging_config["file"]
|
||||
@@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
f"{log_file}.{i}_{level.lower()}",
|
||||
filter=lambda rec, lvl=level: rec["level"].name == lvl,
|
||||
rotation=rotation,
|
||||
format="{extra[serialized]}",
|
||||
)
|
||||
elif log_file:
|
||||
logger.add(log_file, rotation=rotation, level=use_level)
|
||||
logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}")
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
@@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
logger.warning("")
|
||||
logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
|
||||
logger.warning(
|
||||
f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
|
||||
f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n"
|
||||
)
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
@@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(f"{e}: {traceback.format_exc()}")
|
||||
exit(1)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
@@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
yield self.feed_item(item)
|
||||
url_count += 1
|
||||
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
|
||||
logger.info("started processing")
|
||||
yield self.feed_item(item)
|
||||
url_count += 1
|
||||
|
||||
logger.info(f"Processed {url_count} URL(s)")
|
||||
logger.info(f"processed {url_count} URL(s)")
|
||||
self.cleanup()
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
@@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
return self.archive(item)
|
||||
except KeyboardInterrupt:
|
||||
# catches keyboard interruptions to do a clean exit
|
||||
logger.warning(f"caught interrupt on {item=}")
|
||||
logger.warning("caught interrupt")
|
||||
for d in self.databases:
|
||||
d.aborted(item)
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}")
|
||||
for d in self.databases:
|
||||
if isinstance(e, AssertionError):
|
||||
d.failed(item, str(e))
|
||||
@@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
try:
|
||||
check_url_or_raise(original_url)
|
||||
except ValueError as e:
|
||||
logger.error(f"Error archiving URL {original_url}: {e}")
|
||||
logger.error(f"Error archiving: {e}")
|
||||
raise e
|
||||
|
||||
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
|
||||
@@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
result.set_url(url)
|
||||
if original_url != url:
|
||||
logger.debug(f"Sanitized URL from {original_url} to {url}")
|
||||
logger.debug(f"Sanitized URL to {url}")
|
||||
result.set("original_url", original_url)
|
||||
|
||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||
@@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
try:
|
||||
d.done(cached_result, cached=True)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
||||
return cached_result
|
||||
|
||||
# 3 - call extractors until one succeeds
|
||||
for a in self.extractors:
|
||||
logger.info(f"Trying extractor {a.name} for {url}")
|
||||
logger.info(f"trying extractor {a.name}")
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
if result.is_success():
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.enrichers:
|
||||
try:
|
||||
e.enrich(result)
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
# 5 - store all downloaded/generated media
|
||||
result.store(storages=self.storages)
|
||||
@@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
try:
|
||||
d.done(result)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ from abc import abstractmethod
|
||||
from typing import IO
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
@@ -7,7 +7,7 @@ from urllib.parse import urljoin
|
||||
import glob
|
||||
import importlib.util
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import selenium
|
||||
from seleniumbase import SB
|
||||
|
||||
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
continue # Skip imported modules/classes/functions
|
||||
if isinstance(obj, type) and issubclass(obj, Dropin):
|
||||
dropins.append(obj)
|
||||
logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
||||
logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
||||
return dropins
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
@@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
||||
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
||||
url = to_enrich.get_url()
|
||||
url_sample = url[:75]
|
||||
|
||||
try:
|
||||
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
||||
logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
|
||||
logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
|
||||
sb.uc_open_with_reconnect(url, 4)
|
||||
|
||||
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
|
||||
logger.debug("handling CAPTCHAs for...")
|
||||
sb.uc_gui_handle_cf()
|
||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||
|
||||
@@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
dropin.open_page(url)
|
||||
|
||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
|
||||
logger.warning("skipping since auth wall or CAPTCHA was detected")
|
||||
return False
|
||||
|
||||
sb.wait_for_ready_state_complete()
|
||||
@@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
js_css_selector=dropin.js_for_video_css_selectors(),
|
||||
max_media=self.max_download_videos - downloaded_videos,
|
||||
)
|
||||
logger.info(f"ANTIBOT completed for {url_sample}")
|
||||
logger.info("completed")
|
||||
|
||||
return to_enrich
|
||||
except selenium.common.exceptions.SessionNotCreatedException as e:
|
||||
if custom_data_dir: # the retry logic only works once
|
||||
logger.error(
|
||||
f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
||||
f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
||||
)
|
||||
return self.enrich(to_enrich, custom_data_dir=False)
|
||||
raise e # re-raise
|
||||
except Exception as e:
|
||||
logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
|
||||
logger.error(f"runtime error: {e}: {traceback.format_exc()}")
|
||||
return False
|
||||
|
||||
def _get_suitable_dropin(self, url: str, sb: SB):
|
||||
@@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
"""
|
||||
for dropin in self.dropins:
|
||||
if dropin.suitable(url):
|
||||
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
|
||||
logger.debug(f"using drop-in {dropin.__name__}")
|
||||
return dropin(sb, self)
|
||||
|
||||
return DefaultDropin(sb, self)
|
||||
@@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
|
||||
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
||||
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
||||
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
|
||||
logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
|
||||
sb.set_window_size(x, y)
|
||||
|
||||
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
||||
@@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
# js_for_css_selectors
|
||||
for src in sources:
|
||||
if len(all_urls) >= max_media:
|
||||
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
||||
logger.debug(f"reached max download limit of {max_media} images/videos.")
|
||||
break
|
||||
if not is_relevant_url(src):
|
||||
continue
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
# def solve_captcha(image_url):
|
||||
# # Download image
|
||||
# img_data = requests.get(image_url).content
|
||||
# encoded_image = base64.b64encode(img_data).decode()
|
||||
|
||||
# # Submit to AntiCaptcha
|
||||
# task = {
|
||||
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||
# "task": {
|
||||
# "type": "ImageToTextTask",
|
||||
# "body": encoded_image
|
||||
# }
|
||||
# }
|
||||
# print("[*] Sending captcha request to anti-captcha...")
|
||||
|
||||
# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json()
|
||||
# task_id = task_response["taskId"]
|
||||
# print(f"[*] Anti-captcha response: {task_response}")
|
||||
|
||||
# # Poll for result
|
||||
# while True:
|
||||
# time.sleep(5)
|
||||
# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={
|
||||
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||
# "taskId": task_id
|
||||
# }).json()
|
||||
# if res["status"] == "ready":
|
||||
# print(f"[*] Captcha solved: {res}")
|
||||
# return res["solution"]["text"]
|
||||
# print(f"[*] Polling for captcha solution: {res['status']}")
|
||||
|
||||
|
||||
# def solve_recaptcha(site_key, page_url):
|
||||
# print("[*] Sending captcha request to anti-captcha...")
|
||||
# # Step 1: Send captcha request
|
||||
# task_payload = {
|
||||
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||
# "task": {
|
||||
# "type": "NoCaptchaTaskProxyless",
|
||||
# "websiteURL": page_url,
|
||||
# "websiteKey": site_key
|
||||
# }
|
||||
# }
|
||||
# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json()
|
||||
# print(f"[*] Anti-captcha response: {response}")
|
||||
# task_id = response["taskId"]
|
||||
|
||||
# # Step 2: Poll for solution
|
||||
# print("[*] Polling for captcha solution...")
|
||||
# for i in range(40): # ~80 seconds
|
||||
# time.sleep(2)
|
||||
# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={
|
||||
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||
# "taskId": task_id
|
||||
# }).json()
|
||||
# print(f" Poll {i+1}: status={result['status']}")
|
||||
# if result["status"] == "ready":
|
||||
# print("[*] Captcha solved!")
|
||||
# return result["solution"]["gRecaptchaResponse"]
|
||||
# raise TimeoutError("AntiCaptcha took too long")
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import traceback
|
||||
from typing import Mapping
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from seleniumbase import SB
|
||||
import yt_dlp
|
||||
|
||||
@@ -143,7 +144,7 @@ class Dropin:
|
||||
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
||||
for url in video_urls:
|
||||
try:
|
||||
logger.debug(f"Downloading video from URL: {url}")
|
||||
logger.debug("downloading video from url")
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl_entry_to_filename(ydl, info)
|
||||
if not filename: # Failed to download video.
|
||||
@@ -155,5 +156,5 @@ class Dropin:
|
||||
to_enrich.add_media(media)
|
||||
downloaded += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading {url}: {e}")
|
||||
logger.error(f"download failed: {e} {traceback.format_exc()}")
|
||||
return downloaded
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import Mapping
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
|
||||
username, password = self._get_username_password("linkedin.com")
|
||||
logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
|
||||
logger.debug("logging in to Linkedin with username: {}", username)
|
||||
self.sb.type("#username", username)
|
||||
self.sb.type("#password", password)
|
||||
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import Mapping
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
class RedditDropin(Dropin):
|
||||
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
|
||||
self._close_cookies_banner()
|
||||
|
||||
username, password = self._get_username_password("reddit.com")
|
||||
logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
|
||||
logger.debug("logging in to Reddit with username: {}", username)
|
||||
|
||||
self.sb.type("#login-username", username)
|
||||
self.sb.type("#login-password", password)
|
||||
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
|
||||
self.sb.click_link_text("Log in")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
if self.sb.is_text_visible("Welcome back"):
|
||||
logger.debug("RedditDropin Login successful")
|
||||
logger.debug("login successful")
|
||||
self.sb.click_if_visible("this link")
|
||||
|
||||
def _close_cookies_banner(self):
|
||||
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
|
||||
.map(el => el.src || el.href)
|
||||
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
||||
""")
|
||||
logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
|
||||
logger.debug("found {} video URLs", len(filtered_urls))
|
||||
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
||||
|
||||
@@ -4,7 +4,7 @@ from typing import Mapping
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
class VkDropin(Dropin):
|
||||
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
|
||||
self.sb.open("https://vk.com")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
if "/feed" in self.sb.get_current_url():
|
||||
logger.debug("Already logged in to VK.")
|
||||
logger.debug("already logged in to VK.")
|
||||
return True
|
||||
|
||||
# need to login
|
||||
username, password = self._get_username_password("vk.com")
|
||||
logger.debug("Logging in to VK with username: {}", username)
|
||||
logger.debug("logging in to VK with username: {}", username)
|
||||
|
||||
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
||||
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Union
|
||||
|
||||
import os
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -36,9 +36,9 @@ class AAApiDb(Database):
|
||||
if not self.store_results:
|
||||
return
|
||||
if cached:
|
||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||
logger.debug("skipping saving archive to AA API because it was cached")
|
||||
return
|
||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||
logger.debug("saving archive to the AA API.")
|
||||
|
||||
payload = {
|
||||
"author_id": self.author_id,
|
||||
|
||||
@@ -3,7 +3,7 @@ import os
|
||||
from typing import IO, Iterator, Optional, Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
|
||||
from auto_archiver.utils import calculate_file_hash
|
||||
@@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
"""Mark an item as failed in Atlos, if the ID exists."""
|
||||
atlos_id = item.metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
logger.info("No Atlos ID available, skipping")
|
||||
return
|
||||
self._post(
|
||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
||||
)
|
||||
logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")
|
||||
logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check and fetch if the given item has been archived already, each
|
||||
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
"""Mark an item as successfully archived in Atlos."""
|
||||
atlos_id = item.metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
logger.info("item has no Atlos ID, skipping")
|
||||
return
|
||||
self._post(
|
||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
}
|
||||
},
|
||||
)
|
||||
logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")
|
||||
logger.info(f"stored success ID {atlos_id} on Atlos")
|
||||
|
||||
# ! Atlos Module - Storage Methods
|
||||
|
||||
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
||||
"""Upload a media file to Atlos if it has not been uploaded already."""
|
||||
if metadata is None:
|
||||
logger.error(f"No metadata provided for {media.filename}")
|
||||
logger.error(f"no metadata provided for {media.filename}")
|
||||
return False
|
||||
|
||||
atlos_id = metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
||||
logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
||||
return False
|
||||
|
||||
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
||||
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
params={"title": media.properties},
|
||||
files={"file": (os.path.basename(media.filename), file_obj)},
|
||||
)
|
||||
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||
logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||
return True
|
||||
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.consts import SetupError
|
||||
@@ -16,8 +14,5 @@ class CLIFeeder(Feeder):
|
||||
def __iter__(self) -> Metadata:
|
||||
urls = self.config["urls"]
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import os
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import csv
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
@@ -20,20 +20,19 @@ class CSVFeeder(Feeder):
|
||||
url_column = first_row.index(url_column)
|
||||
except ValueError:
|
||||
logger.error(
|
||||
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||
f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||
)
|
||||
return
|
||||
elif not (url_or_none(first_row[url_column])):
|
||||
# it's a header row, but we've been given a column number already
|
||||
logger.debug(f"Skipping header row: {first_row}")
|
||||
logger.debug(f"skipping header row: {first_row}")
|
||||
else:
|
||||
# first row isn't a header row, rewind the file
|
||||
f.seek(0)
|
||||
|
||||
for row in reader:
|
||||
if not url_or_none(row[url_column]):
|
||||
logger.warning(f"Not a valid URL in row: {row}, skipping")
|
||||
logger.warning(f"not a valid URL in row: {row}, skipping")
|
||||
continue
|
||||
url = row[url_column]
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url)
|
||||
|
||||
@@ -8,7 +8,7 @@ from google.oauth2 import service_account
|
||||
from google.oauth2.credentials import Credentials
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
|
||||
def _setup_google_drive_service(self):
|
||||
"""Initialize Google Drive service based on provided credentials."""
|
||||
if self.oauth_token:
|
||||
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
|
||||
logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
|
||||
self.service = self._initialize_with_oauth_token()
|
||||
elif self.service_account:
|
||||
logger.debug(f"Using Google Drive service account: {self.service_account}")
|
||||
logger.debug(f"using Google Drive service account: {self.service_account}")
|
||||
self.service = self._initialize_with_service_account()
|
||||
else:
|
||||
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
||||
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
|
||||
if not creds.valid and creds.expired and creds.refresh_token:
|
||||
creds.refresh(Request())
|
||||
with open(self.oauth_token, "w") as token_file:
|
||||
logger.debug("Saving refreshed OAuth token.")
|
||||
logger.debug("saving refreshed OAuth token.")
|
||||
token_file.write(creds.to_json())
|
||||
elif not creds.valid:
|
||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
|
||||
Creates a new GDrive folder @name inside folder @parent_id
|
||||
Returns id of the created folder
|
||||
"""
|
||||
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
|
||||
logger.debug(f"creating new folder with {name=} inside {parent_id=}")
|
||||
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
||||
return gd_folder.get("id")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
|
||||
# download if embeds present (1 video XOR >=1 images)
|
||||
for media in self._download_bsky_embeds(post, archiver):
|
||||
result.add_media(media)
|
||||
logger.debug(f"Downloaded {len(result.media)} media files")
|
||||
logger.debug(f"downloaded {len(result.media)} media files")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor
|
||||
from yt_dlp.utils import MaxDownloadsReached
|
||||
import pysubs2
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
@@ -63,12 +63,11 @@ class GenericExtractor(Extractor):
|
||||
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
|
||||
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
|
||||
else:
|
||||
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
|
||||
logger.warning(" ======= RESTARTING ======= ")
|
||||
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ")
|
||||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||||
|
||||
def update_package(self, package_name: str) -> bool:
|
||||
logger.info(f"Checking and updating {package_name}...")
|
||||
logger.info(f"checking and updating {package_name}...")
|
||||
from importlib.metadata import version as get_version
|
||||
|
||||
old_version = get_version(package_name)
|
||||
@@ -80,7 +79,7 @@ class GenericExtractor(Extractor):
|
||||
return True
|
||||
logger.info(f"{package_name} already up to date")
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating {package_name}: {e}")
|
||||
logger.error(f"failed to update {package_name}: {e}")
|
||||
return False
|
||||
|
||||
def setup_po_tokens(self) -> None:
|
||||
@@ -111,7 +110,7 @@ class GenericExtractor(Extractor):
|
||||
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
||||
if missing_tools:
|
||||
logger.error(
|
||||
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
||||
f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
||||
"Install these tools or run bgutils via Docker. "
|
||||
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
||||
)
|
||||
@@ -140,7 +139,7 @@ class GenericExtractor(Extractor):
|
||||
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
||||
)
|
||||
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
||||
logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
|
||||
logger.info(f"downloading bgutils release zip for version {plugin_version}...")
|
||||
urlretrieve(zip_url, zip_path)
|
||||
with zipfile.ZipFile(zip_path, "r") as z:
|
||||
z.extractall(base_dir)
|
||||
@@ -149,7 +148,7 @@ class GenericExtractor(Extractor):
|
||||
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
||||
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
||||
shutil.rmtree(extracted_root)
|
||||
logger.info("Installing dependencies and transpiling PoT Generator script...")
|
||||
logger.info("installing dependencies and transpiling PoT Generator script...")
|
||||
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
||||
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
||||
|
||||
@@ -165,7 +164,7 @@ class GenericExtractor(Extractor):
|
||||
logger.info(f"PO Token script configured at: {script_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set up PO Token script: {e}")
|
||||
logger.error(f"failed to set up PO Token script: {e}")
|
||||
|
||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||
"""
|
||||
@@ -206,7 +205,7 @@ class GenericExtractor(Extractor):
|
||||
media = Media(cover_image_path)
|
||||
metadata.add_media(media, id="cover")
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
|
||||
logger.error(f"could not download cover image {thumbnail_url}: {e}")
|
||||
|
||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||
if dropin:
|
||||
@@ -353,7 +352,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
if not dropin:
|
||||
# TODO: add a proper link to 'how to create your own dropin'
|
||||
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
||||
logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
|
||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||
return False
|
||||
|
||||
@@ -389,7 +388,7 @@ class GenericExtractor(Extractor):
|
||||
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
||||
continue
|
||||
|
||||
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||
logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||
|
||||
new_media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
@@ -404,12 +403,12 @@ class GenericExtractor(Extractor):
|
||||
text = " ".join([line.text for line in subs])
|
||||
new_media.set(f"subtitles_{lang}", text)
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||
logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
|
||||
result.add_media(new_media)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
logger.error(f"error processing entry {entry}: {e}")
|
||||
if not len(result.media):
|
||||
logger.info(f"No media found for entry {entry}, skipping.")
|
||||
logger.info(f"no media found for entry {entry}, skipping.")
|
||||
return False
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
@@ -471,14 +470,14 @@ class GenericExtractor(Extractor):
|
||||
|
||||
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
||||
if data.get("is_live", False) and not self.livestreams:
|
||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
return False
|
||||
# it's a valid video, that the youtubdedl can download out of the box
|
||||
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||
|
||||
try:
|
||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
raise SkipYtdlp()
|
||||
|
||||
# don't download since it can be a live stream
|
||||
@@ -497,17 +496,17 @@ class GenericExtractor(Extractor):
|
||||
|
||||
if not isinstance(e, SkipYtdlp):
|
||||
logger.debug(
|
||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
||||
f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
||||
)
|
||||
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
||||
logger.error("error downloading metadata for post: {error}", error=str(post_e))
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
logger.debug(
|
||||
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||
'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||
name=info_extractor.IE_NAME,
|
||||
error=str(generic_e),
|
||||
exc_info=True,
|
||||
@@ -560,17 +559,17 @@ class GenericExtractor(Extractor):
|
||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if "username" in auth and "password" in auth:
|
||||
logger.debug(f"Using provided auth username and password for {url}")
|
||||
logger.debug("using provided auth username and password")
|
||||
ydl_options.extend(("--username", auth["username"]))
|
||||
ydl_options.extend(("--password", auth["password"]))
|
||||
elif "cookie" in auth:
|
||||
logger.debug(f"Using provided auth cookie for {url}")
|
||||
logger.debug("using provided auth cookie")
|
||||
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||
elif "cookies_from_browser" in auth:
|
||||
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
|
||||
logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
|
||||
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
||||
elif "cookies_file" in auth:
|
||||
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
|
||||
logger.debug(f"using cookies from file {auth['cookies_file']}")
|
||||
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
||||
|
||||
# Applying user-defined extractor_args
|
||||
@@ -580,11 +579,11 @@ class GenericExtractor(Extractor):
|
||||
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
||||
else:
|
||||
arg_str = str(args)
|
||||
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
|
||||
logger.debug(f"setting extractor_args: {key}:{arg_str}")
|
||||
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
||||
|
||||
if self.ytdlp_args:
|
||||
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
|
||||
logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
|
||||
ydl_options += self.ytdlp_args.split(" ")
|
||||
|
||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
|
||||
|
||||
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
|
||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
||||
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
||||
logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
|
||||
|
||||
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import re
|
||||
import mimetypes
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
|
||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
return False
|
||||
|
||||
full_text = tweet.pop("full_text", "")
|
||||
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
|
||||
|
||||
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
||||
if not tweet.get("entities", {}).get("media"):
|
||||
logger.debug("No media found, archiving tweet text only")
|
||||
logger.debug("no media found, archiving tweet text only")
|
||||
result.status = "twitter-ytdl"
|
||||
return result
|
||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||
|
||||
@@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects.
|
||||
"""
|
||||
|
||||
import os
|
||||
import traceback
|
||||
from typing import Tuple, Union, Iterator
|
||||
from urllib.parse import quote
|
||||
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from slugify import slugify
|
||||
from retrying import retry
|
||||
|
||||
@@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
sh = self.open_sheet()
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||
logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.debug(
|
||||
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||
f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||
)
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.info(f"Finished worksheet {worksheet.title}")
|
||||
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.info(f"finished worksheet {worksheet.title}")
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
@@ -69,7 +70,9 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
with logger.contextualize(row=row):
|
||||
yield m
|
||||
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
# TODO: Check folder value not being recognised
|
||||
@@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
return missing
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.info(f"STARTED {item}")
|
||||
logger.info("STARTED")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
logger.error("FAILED")
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
logger.warning("ABORTED")
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
@@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
spreadsheet = gw.wks.spreadsheet.title
|
||||
worksheet = gw.wks.title
|
||||
logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}")
|
||||
logger.info("DONE")
|
||||
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
@@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
logger.error(f"unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
@@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(
|
||||
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
|
||||
)
|
||||
logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
|
||||
@@ -9,7 +9,7 @@ making it suitable for handling large files efficiently.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -22,8 +22,7 @@ class HashEnricher(Enricher):
|
||||
"""
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
logger.debug(f"calculating media hashes with algo={self.algorithm}")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if len(hd := self.calculate_hash(m.filename)):
|
||||
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
import pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import json
|
||||
import base64
|
||||
|
||||
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
|
||||
def format(self, item: Metadata) -> Media:
|
||||
url = item.get_url()
|
||||
if item.is_empty():
|
||||
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
|
||||
logger.debug("nothing to format, skipping")
|
||||
return
|
||||
|
||||
content = self.template.render(
|
||||
|
||||
@@ -14,7 +14,7 @@ from datetime import datetime
|
||||
import traceback
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from retrying import retry
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor):
|
||||
url = item.get_url()
|
||||
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
|
||||
insta_matches = self.valid_url.findall(url)
|
||||
logger.info(f"{insta_matches=}")
|
||||
|
||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||
return
|
||||
if len(insta_matches) > 1:
|
||||
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
|
||||
logger.debug("multiple instagram matches found, using the first one")
|
||||
return
|
||||
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
||||
if g1 == "":
|
||||
@@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
return self.download_post(item, id=g3, context="story")
|
||||
return self.download_stories(item, g2)
|
||||
else:
|
||||
logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
|
||||
logger.warning(f"unknown instagram regex group match {g1=}")
|
||||
return
|
||||
|
||||
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
|
||||
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
count_posts += len(stories)
|
||||
result.set("#stories", len(stories))
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading stories for {username}")
|
||||
logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading stories for {username}")
|
||||
logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
|
||||
|
||||
# download all posts
|
||||
try:
|
||||
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||
)
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading posts for {username}")
|
||||
logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading posts for {username}")
|
||||
logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
|
||||
|
||||
# download all tagged
|
||||
try:
|
||||
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||
)
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading tagged posts for {username}")
|
||||
logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading tagged posts for {username}")
|
||||
logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
||||
|
||||
# download all highlights
|
||||
try:
|
||||
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
except Exception as e:
|
||||
result.append(
|
||||
"errors",
|
||||
f"Error downloading highlight id{h.get('pk')} for {username}",
|
||||
f"error downloading highlight id{h.get('pk')} for {username}",
|
||||
)
|
||||
logger.error(
|
||||
f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
||||
f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
||||
)
|
||||
if count_highlights >= max_to_download:
|
||||
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
|
||||
@@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
try:
|
||||
self.scrape_item(result, h, "highlight")
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
||||
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading highlight {h.get('id')}")
|
||||
logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
||||
|
||||
return h_info
|
||||
|
||||
@@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
try:
|
||||
self.scrape_item(result, p, "post")
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading post {p.get('id')}")
|
||||
logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading post {p.get('id')}")
|
||||
logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||
pbar.update(1)
|
||||
post_count += 1
|
||||
if post_count >= max_to_download:
|
||||
@@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor):
|
||||
try:
|
||||
self.scrape_item(result, p, "tagged")
|
||||
except Exception as e:
|
||||
result.append("errors", f"Error downloading tagged post {p.get('id')}")
|
||||
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||
result.append("errors", f"error downloading tagged post {p.get('id')}")
|
||||
logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||
pbar.update(1)
|
||||
tagged_count += 1
|
||||
if tagged_count >= max_to_download:
|
||||
|
||||
@@ -8,7 +8,7 @@ import re
|
||||
import os
|
||||
import shutil
|
||||
import instaloader
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -29,8 +29,9 @@ class InstagramExtractor(Extractor):
|
||||
# TODO: links to stories
|
||||
|
||||
def setup(self) -> None:
|
||||
logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
|
||||
logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
|
||||
logger.warning(
|
||||
"Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead."
|
||||
)
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True,
|
||||
@@ -43,12 +44,11 @@ class InstagramExtractor(Extractor):
|
||||
self.insta.load_session_from_file(self.username, self.session_file)
|
||||
except Exception:
|
||||
try:
|
||||
logger.debug("Session file failed", exc_info=True)
|
||||
logger.info("No valid session file found - Attempting login with use and password.")
|
||||
logger.info("no valid session file found - Attempting login with use and password.")
|
||||
self.insta.login(self.username, self.password)
|
||||
self.insta.save_session_to_file(self.session_file)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||
logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
@@ -72,14 +72,14 @@ class InstagramExtractor(Extractor):
|
||||
result = self.download_profile(url, profile_matches[0])
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||
f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||
)
|
||||
finally:
|
||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||
return result
|
||||
|
||||
def download_post(self, url: str, post_id: str) -> Metadata:
|
||||
logger.debug(f"Instagram {post_id=} detected in {url=}")
|
||||
logger.debug(f"Instagram {post_id=} detected")
|
||||
|
||||
post = instaloader.Post.from_shortcode(self.insta.context, post_id)
|
||||
if self.insta.download_post(post, target=post.owner_username):
|
||||
@@ -87,7 +87,7 @@ class InstagramExtractor(Extractor):
|
||||
|
||||
def download_profile(self, url: str, username: str) -> Metadata:
|
||||
# gets posts, posts where username is tagged, igtv postss, stories, and highlights
|
||||
logger.debug(f"Instagram {username=} detected in {url=}")
|
||||
logger.debug(f"Instagram {username=} detected")
|
||||
|
||||
profile = instaloader.Profile.from_username(self.insta.context, username)
|
||||
try:
|
||||
@@ -95,27 +95,27 @@ class InstagramExtractor(Extractor):
|
||||
try:
|
||||
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
||||
logger.error(f"failed to download post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_posts: {e}")
|
||||
logger.error(f"failed profile.get_posts: {e}")
|
||||
|
||||
try:
|
||||
for post in profile.get_tagged_posts():
|
||||
try:
|
||||
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
||||
logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_tagged_posts: {e}")
|
||||
logger.error(f"failed profile.get_tagged_posts: {e}")
|
||||
|
||||
try:
|
||||
for post in profile.get_igtv_posts():
|
||||
try:
|
||||
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
||||
logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed profile.get_igtv_posts: {e}")
|
||||
logger.error(f"failed profile.get_igtv_posts: {e}")
|
||||
|
||||
try:
|
||||
for story in self.insta.get_stories([profile.userid]):
|
||||
@@ -123,9 +123,9 @@ class InstagramExtractor(Extractor):
|
||||
try:
|
||||
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download story item: {item}: {e}")
|
||||
logger.error(f"failed to download story item: {item}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed get_stories: {e}")
|
||||
logger.error(f"failed get_stories: {e}")
|
||||
|
||||
try:
|
||||
for highlight in self.insta.get_highlights(profile.userid):
|
||||
@@ -133,9 +133,9 @@ class InstagramExtractor(Extractor):
|
||||
try:
|
||||
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download highlight item: {item}: {e}")
|
||||
logger.error(f"failed to download highlight item: {item}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed get_highlights: {e}")
|
||||
logger.error(f"failed get_highlights: {e}")
|
||||
|
||||
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
||||
|
||||
@@ -158,4 +158,4 @@ class InstagramExtractor(Extractor):
|
||||
|
||||
return result.success("instagram")
|
||||
except Exception as e:
|
||||
logger.error(f"Could not fetch instagram post {url} due to: {e}")
|
||||
logger.error(f"could not fetch instagram post due to: {e}")
|
||||
|
||||
@@ -12,7 +12,7 @@ import shutil
|
||||
import time
|
||||
from sqlite3 import OperationalError
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from telethon.sync import TelegramClient
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import os
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import shutil
|
||||
from typing import IO
|
||||
import os
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import datetime
|
||||
import os
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import subprocess
|
||||
import traceback
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import opentimestamps
|
||||
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
|
||||
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
||||
|
||||
@@ -15,7 +15,7 @@ import traceback
|
||||
import pdqhash
|
||||
import numpy as np
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import IO
|
||||
|
||||
import boto3
|
||||
import os
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
@@ -2,7 +2,7 @@ import ssl
|
||||
import os
|
||||
from slugify import slugify
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
@@ -2,7 +2,7 @@ import requests
|
||||
import re
|
||||
import html
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
@@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import (
|
||||
)
|
||||
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
|
||||
|
||||
import ffmpeg
|
||||
import os
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
@@ -5,7 +5,7 @@ import hashlib
|
||||
|
||||
from slugify import slugify
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
||||
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
||||
|
||||
@@ -4,7 +4,7 @@ import re
|
||||
import mimetypes
|
||||
import requests
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from pytwitter import Api
|
||||
from slugify import slugify
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from zipfile import ZipFile
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import time
|
||||
import requests
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import traceback
|
||||
import requests
|
||||
import time
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
37
src/auto_archiver/utils/custom_logger.py
Normal file
37
src/auto_archiver/utils/custom_logger.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from loguru import logger
|
||||
import json
|
||||
|
||||
|
||||
def extract_log_data(record):
    """Build a compact, JSON-serializable summary of a log record.

    Args:
        record: Mapping describing one log event. It must provide
            ``level`` (object with ``.name``), ``time`` (object with
            ``.isoformat``), ``file`` (object with ``.name``), ``function``,
            ``line`` and ``message``. ``extra`` and ``exception`` are
            optional.

    Returns:
        dict: Keys in insertion order ``level``, ``time``, ``loc``, then any
        truthy ``trace``/``url``/``worksheet``/``row`` values found in
        ``extra``, then ``message`` and, when an exception is attached,
        ``exception`` as a ``"TypeName: message"`` string.
    """
    subset = {
        "level": record["level"].name,
        "time": record["time"].isoformat(timespec="seconds"),
    }
    subset["loc"] = f"{record['file'].name}:{record['function']}:{record['line']}"

    # Only a fixed set of context keys is forwarded; falsy values are skipped.
    extra = record.get("extra", {})
    for extra_key in ["trace", "url", "worksheet", "row"]:
        if extra_val := extra.get(extra_key):
            subset[extra_key] = extra_val

    subset["message"] = record["message"]

    if exception := record.get("exception"):
        # The raw exception info is a (type, value, traceback) triple whose
        # members json.dumps cannot encode, so reduce it to plain text here
        # rather than letting serialization fail while logging an error.
        if isinstance(exception, BaseException):
            exc_type, exc_value = type(exception), exception
        else:
            exc_type, exc_value = exception[0], exception[1]
        type_name = getattr(exc_type, "__name__", str(exc_type))
        subset["exception"] = f"{type_name}: {exc_value}"

    return subset
|
||||
|
||||
|
||||
def serialize_no_message(record):
    """Return the extracted log fields as a JSON string, without the message.

    Every field produced by ``extract_log_data`` is kept in its original
    order; only the ``message`` entry is left out.
    """
    fields = {
        key: value
        for key, value in extract_log_data(record).items()
        if key != "message"
    }
    return json.dumps(fields, ensure_ascii=False)
|
||||
|
||||
|
||||
def serialize(record):
    """Return the extracted log fields, message included, as a JSON string."""
    fields = extract_log_data(record)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    return json.dumps(fields, ensure_ascii=False)
|
||||
|
||||
|
||||
def patching(record):
    """Attach pre-rendered JSON forms of the record to ``record["extra"]``.

    Two keys are written: ``serialized`` (all extracted fields) and
    ``serialize_no_message`` (the same fields minus the message).
    """
    with_message = serialize(record)
    without_message = serialize_no_message(record)
    record["extra"].update(
        {
            "serialized": with_message,
            "serialize_no_message": without_message,
        }
    )
|
||||
|
||||
|
||||
# Rebind the module-level name to the result of logger.patch(patching), so
# that importers of this module receive the patched logger.
logger = logger.patch(patching)
|
||||
@@ -7,7 +7,7 @@ from datetime import datetime, timezone
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
|
||||
@@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory
|
||||
from typing import Dict, Tuple
|
||||
import hashlib
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import pytest
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from auto_archiver.core import Extractor
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
class ExampleExtractor(Extractor):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
||||
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
|
||||
@@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path):
|
||||
def autoarchiver(tmp_path, monkeypatch, request):
|
||||
def _autoarchiver(args=[]):
|
||||
def cleanup():
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
if not logger._core.handlers.get(0):
|
||||
logger._core.handlers_count = 0
|
||||
|
||||
Reference in New Issue
Block a user