WIP refactor logging

This commit is contained in:
msramalho
2025-06-21 15:54:51 +01:00
parent ade7feb5a0
commit ce4d7ac649
54 changed files with 298 additions and 207 deletions

View File

@@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run
import os
from telethon.sync import TelegramClient
from loguru import logger
from auto_archiver.utils.custom_logger import logger
# Create a

View File

@@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger
from auto_archiver.utils.custom_logger import logger
if TYPE_CHECKING:
from .module import ModuleFactory

View File

@@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap
import json
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES
@@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
"""
Override of error to format a nicer looking error message using logger
"""
logger.error("Problem with configuration file (tip: use --help to see the available options):")
logger.error(message)
logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}")
self.exit(2)
def parse_known_args(self, args=None, namespace=None):
@@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
try:
self._check_value(action, action.default)
except argparse.ArgumentError as e:
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):")
logger.error(e)
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}")
exit()
return super().parse_known_args(args, namespace)

View File

@@ -12,7 +12,7 @@ from contextlib import suppress
import mimetypes
import os
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from retrying import retry
import re
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose:
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
logger.debug(f"downloading {to_filename=}")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
@@ -117,7 +117,7 @@ class Extractor(BaseModule):
return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
logger.warning(f"Failed to fetch the Media URL: {e}")
if try_best_quality:
return None, url

View File

@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
from loguru import logger
from auto_archiver.utils.custom_logger import logger
@dataclass_json # annotation order matters
@@ -121,8 +121,7 @@ class Media:
except Error:
return False # ffmpeg errors when reading bad files
except Exception as e:
logger.error(e)
logger.error(traceback.format_exc())
logger.error(f"{e}: {traceback.format_exc()}")
try:
fsize = os.path.getsize(self.filename)
return fsize > 20_000

View File

@@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json
import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from .media import Media

View File

@@ -16,7 +16,7 @@ import sys
from importlib.util import find_spec
import os
from os.path import join
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import auto_archiver
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError

View File

@@ -15,9 +15,11 @@ import traceback
from copy import copy
from rich_argparse import RichHelpFormatter
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import requests
from auto_archiver.utils.misc import random_str
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import (
@@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
use_level = logging_config["level"]
self.logger_id = logger.add(sys.stderr, level=use_level)
self.logger_id = logger.add(
sys.stderr,
level=use_level,
catch=True,
format="<level>{level}</level>: <fg #64FFDA>{message}</fg #64FFDA> {extra[serialize_no_message]}",
)
rotation = logging_config["rotation"]
log_file = logging_config["file"]
@@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
f"{log_file}.{i}_{level.lower()}",
filter=lambda rec, lvl=level: rec["level"].name == lvl,
rotation=rotation,
format="{extra[serialized]}",
)
elif log_file:
logger.add(log_file, rotation=rotation, level=use_level)
logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}")
def install_modules(self, modules_by_type):
"""
@@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
else:
update_cmd = "`pip install --upgrade auto-archiver`"
logger.warning("")
logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
logger.warning(
f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n"
)
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
logger.warning("")
def setup(self, args: list):
"""
@@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.setup(args)
return self.feed()
except Exception as e:
logger.error(e)
logger.error(f"{e}: {traceback.format_exc()}")
exit(1)
def cleanup(self) -> None:
@@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
url_count = 0
for feeder in self.feeders:
for item in feeder:
yield self.feed_item(item)
url_count += 1
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
logger.info("started processing")
yield self.feed_item(item)
url_count += 1
logger.info(f"Processed {url_count} URL(s)")
logger.info(f"processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:
@@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
logger.warning("caught interrupt")
for d in self.databases:
d.aborted(item)
self.cleanup()
exit()
except Exception as e:
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}")
for d in self.databases:
if isinstance(e, AssertionError):
d.failed(item, str(e))
@@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
check_url_or_raise(original_url)
except ValueError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
logger.error(f"Error archiving: {e}")
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
@@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
result.set_url(url)
if original_url != url:
logger.debug(f"Sanitized URL from {original_url} to {url}")
logger.debug(f"Sanitized URL to {url}")
result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
@@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call extractors until one succeeds
for a in self.extractors:
logger.info(f"Trying extractor {a.name} for {url}")
logger.info(f"trying extractor {a.name}")
try:
result.merge(a.download(result))
if result.is_success():
break
except Exception as e:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.enrichers:
try:
e.enrich(result)
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
result.store(storages=self.storages)
@@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
return result

View File

@@ -24,7 +24,7 @@ from abc import abstractmethod
from typing import IO
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from auto_archiver.utils.misc import random_str

View File

@@ -7,7 +7,7 @@ from urllib.parse import urljoin
import glob
import importlib.util
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import selenium
from seleniumbase import SB
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
continue # Skip imported modules/classes/functions
if isinstance(obj, type) and issubclass(obj, Dropin):
dropins.append(obj)
logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
return dropins
def sanitize_url(self, url: str) -> str:
@@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
using_user_data_dir = self.user_data_dir if custom_data_dir else None
url = to_enrich.get_url()
url_sample = url[:75]
try:
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
sb.uc_open_with_reconnect(url, 4)
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
logger.debug("handling CAPTCHAs for...")
sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
@@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin.open_page(url)
if self.detect_auth_wall and self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
logger.warning("skipping since auth wall or CAPTCHA was detected")
return False
sb.wait_for_ready_state_complete()
@@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos,
)
logger.info(f"ANTIBOT completed for {url_sample}")
logger.info("completed")
return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e:
if custom_data_dir: # the retry logic only works once
logger.error(
f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
)
return self.enrich(to_enrich, custom_data_dir=False)
raise e # re-raise
except Exception as e:
logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
logger.error(f"runtime error: {e}: {traceback.format_exc()}")
return False
def _get_suitable_dropin(self, url: str, sb: SB):
@@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
for dropin in self.dropins:
if dropin.suitable(url):
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
logger.debug(f"using drop-in {dropin.__name__}")
return dropin(sb, self)
return DefaultDropin(sb, self)
@@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# js_for_css_selectors
for src in sources:
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")
logger.debug(f"reached max download limit of {max_media} images/videos.")
break
if not is_relevant_url(src):
continue

View File

@@ -0,0 +1,60 @@
# def solve_captcha(image_url):
# # Download image
# img_data = requests.get(image_url).content
# encoded_image = base64.b64encode(img_data).decode()
# # Submit to AntiCaptcha
# task = {
# "clientKey": ANTI_CAPTCHA_KEY,
# "task": {
# "type": "ImageToTextTask",
# "body": encoded_image
# }
# }
# print("[*] Sending captcha request to anti-captcha...")
# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json()
# task_id = task_response["taskId"]
# print(f"[*] Anti-captcha response: {task_response}")
# # Poll for result
# while True:
# time.sleep(5)
# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={
# "clientKey": ANTI_CAPTCHA_KEY,
# "taskId": task_id
# }).json()
# if res["status"] == "ready":
# print(f"[*] Captcha solved: {res}")
# return res["solution"]["text"]
# print(f"[*] Polling for captcha solution: {res['status']}")
# def solve_recaptcha(site_key, page_url):
# print("[*] Sending captcha request to anti-captcha...")
# # Step 1: Send captcha request
# task_payload = {
# "clientKey": ANTI_CAPTCHA_KEY,
# "task": {
# "type": "NoCaptchaTaskProxyless",
# "websiteURL": page_url,
# "websiteKey": site_key
# }
# }
# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json()
# print(f"[*] Anti-captcha response: {response}")
# task_id = response["taskId"]
# # Step 2: Poll for solution
# print("[*] Polling for captcha solution...")
# for i in range(40): # ~80 seconds
# time.sleep(2)
# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={
# "clientKey": ANTI_CAPTCHA_KEY,
# "taskId": task_id
# }).json()
# print(f" Poll {i+1}: status={result['status']}")
# if result["status"] == "ready":
# print("[*] Captcha solved!")
# return result["solution"]["gRecaptchaResponse"]
# raise TimeoutError("AntiCaptcha took too long")

View File

@@ -1,6 +1,7 @@
import os
import traceback
from typing import Mapping
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from seleniumbase import SB
import yt_dlp
@@ -143,7 +144,7 @@ class Dropin:
with yt_dlp.YoutubeDL(validated_options) as ydl:
for url in video_urls:
try:
logger.debug(f"Downloading video from URL: {url}")
logger.debug("downloading video from url")
info = ydl.extract_info(url, download=True)
filename = ydl_entry_to_filename(ydl, info)
if not filename: # Failed to download video.
@@ -155,5 +156,5 @@ class Dropin:
to_enrich.add_media(media)
downloaded += 1
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
logger.error(f"download failed: {e} {traceback.format_exc()}")
return downloaded

View File

@@ -1,5 +1,5 @@
from typing import Mapping
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
self.sb.wait_for_ready_state_complete()
username, password = self._get_username_password("linkedin.com")
logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
logger.debug("logging in to Linkedin with username: {}", username)
self.sb.type("#username", username)
self.sb.type("#password", password)
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)

View File

@@ -3,7 +3,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from loguru import logger
from auto_archiver.utils.custom_logger import logger
class RedditDropin(Dropin):
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
self._close_cookies_banner()
username, password = self._get_username_password("reddit.com")
logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
logger.debug("logging in to Reddit with username: {}", username)
self.sb.type("#login-username", username)
self.sb.type("#login-password", password)
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
self.sb.click_link_text("Log in")
self.sb.wait_for_ready_state_complete()
if self.sb.is_text_visible("Welcome back"):
logger.debug("RedditDropin Login successful")
logger.debug("login successful")
self.sb.click_if_visible("this link")
def _close_cookies_banner(self):
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
.map(el => el.src || el.href)
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
""")
logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
logger.debug("found {} video URLs", len(filtered_urls))
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)

View File

@@ -4,7 +4,7 @@ from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from loguru import logger
from auto_archiver.utils.custom_logger import logger
class VkDropin(Dropin):
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url():
logger.debug("Already logged in to VK.")
logger.debug("already logged in to VK.")
return True
# need to login
username, password = self._get_username_password("vk.com")
logger.debug("Logging in to VK with username: {}", username)
logger.debug("logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)

View File

@@ -2,7 +2,7 @@ from typing import Union
import os
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
@@ -36,9 +36,9 @@ class AAApiDb(Database):
if not self.store_results:
return
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
logger.debug("skipping saving archive to AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
logger.debug("saving archive to the AA API.")
payload = {
"author_id": self.author_id,

View File

@@ -3,7 +3,7 @@ import os
from typing import IO, Iterator, Optional, Union
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
from auto_archiver.utils import calculate_file_hash
@@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as failed in Atlos, if the ID exists."""
atlos_id = item.metadata.get("atlos_id")
if not atlos_id:
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
logger.info("No Atlos ID available, skipping")
return
self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
json={"metadata": {"processed": True, "status": "error", "error": reason}},
)
logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")
logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check and fetch if the given item has been archived already, each
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as successfully archived in Atlos."""
atlos_id = item.metadata.get("atlos_id")
if not atlos_id:
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
logger.info("item has no Atlos ID, skipping")
return
self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
}
},
)
logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")
logger.info(f"stored success ID {atlos_id} on Atlos")
# ! Atlos Module - Storage Methods
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
"""Upload a media file to Atlos if it has not been uploaded already."""
if metadata is None:
logger.error(f"No metadata provided for {media.filename}")
logger.error(f"no metadata provided for {media.filename}")
return False
atlos_id = metadata.get("atlos_id")
if not atlos_id:
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
return False
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
params={"title": media.properties},
files={"file": (os.path.basename(media.filename), file_obj)},
)
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
return True
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:

View File

@@ -1,5 +1,3 @@
from loguru import logger
from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.consts import SetupError
@@ -16,8 +14,5 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
urls = self.config["urls"]
for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
yield m
logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -1,4 +1,4 @@
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata

View File

@@ -1,5 +1,5 @@
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from csv import DictWriter
from dataclasses import asdict

View File

@@ -1,4 +1,4 @@
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import csv
from auto_archiver.core import Feeder
@@ -20,20 +20,19 @@ class CSVFeeder(Feeder):
url_column = first_row.index(url_column)
except ValueError:
logger.error(
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
)
return
elif not (url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already
logger.debug(f"Skipping header row: {first_row}")
logger.debug(f"skipping header row: {first_row}")
else:
# first row isn't a header row, rewind the file
f.seek(0)
for row in reader:
if not url_or_none(row[url_column]):
logger.warning(f"Not a valid URL in row: {row}, skipping")
logger.warning(f"not a valid URL in row: {row}, skipping")
continue
url = row[url_column]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)

View File

@@ -8,7 +8,7 @@ from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials."""
if self.oauth_token:
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token()
elif self.service_account:
logger.debug(f"Using Google Drive service account: {self.service_account}")
logger.debug(f"using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account()
else:
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request())
with open(self.oauth_token, "w") as token_file:
logger.debug("Saving refreshed OAuth token.")
logger.debug("saving refreshed OAuth token.")
token_file.write(creds.to_json())
elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.")
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder
"""
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
logger.debug(f"creating new folder with {name=} inside {parent_id=}")
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
return gd_folder.get("id")

View File

@@ -1,4 +1,4 @@
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
# download if embeds present (1 video XOR >=1 images)
for media in self._download_bsky_embeds(post, archiver):
result.add_media(media)
logger.debug(f"Downloaded {len(result.media)} media files")
logger.debug(f"downloaded {len(result.media)} media files")
return result

View File

@@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import MaxDownloadsReached
import pysubs2
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
@@ -63,12 +63,11 @@ class GenericExtractor(Extractor):
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
else:
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
logger.warning(" ======= RESTARTING ======= ")
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ")
os.execv(sys.executable, [sys.executable] + sys.argv)
def update_package(self, package_name: str) -> bool:
logger.info(f"Checking and updating {package_name}...")
logger.info(f"checking and updating {package_name}...")
from importlib.metadata import version as get_version
old_version = get_version(package_name)
@@ -80,7 +79,7 @@ class GenericExtractor(Extractor):
return True
logger.info(f"{package_name} already up to date")
except Exception as e:
logger.error(f"Error updating {package_name}: {e}")
logger.error(f"failed to update {package_name}: {e}")
return False
def setup_po_tokens(self) -> None:
@@ -111,7 +110,7 @@ class GenericExtractor(Extractor):
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
if missing_tools:
logger.error(
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
"Install these tools or run bgutils via Docker. "
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
)
@@ -140,7 +139,7 @@ class GenericExtractor(Extractor):
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
)
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
logger.info(f"downloading bgutils release zip for version {plugin_version}...")
urlretrieve(zip_url, zip_path)
with zipfile.ZipFile(zip_path, "r") as z:
z.extractall(base_dir)
@@ -149,7 +148,7 @@ class GenericExtractor(Extractor):
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
shutil.move(os.path.join(extracted_root, "server"), server_dir)
shutil.rmtree(extracted_root)
logger.info("Installing dependencies and transpiling PoT Generator script...")
logger.info("installing dependencies and transpiling PoT Generator script...")
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
@@ -165,7 +164,7 @@ class GenericExtractor(Extractor):
logger.info(f"PO Token script configured at: {script_path}")
except Exception as e:
logger.error(f"Failed to set up PO Token script: {e}")
logger.error(f"failed to set up PO Token script: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
"""
@@ -206,7 +205,7 @@ class GenericExtractor(Extractor):
media = Media(cover_image_path)
metadata.add_media(media, id="cover")
except Exception as e:
logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
logger.error(f"could not download cover image {thumbnail_url}: {e}")
dropin = self.dropin_for_name(info_extractor.ie_key())
if dropin:
@@ -353,7 +352,7 @@ class GenericExtractor(Extractor):
if not dropin:
# TODO: add a proper link to 'how to create your own dropin'
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
return False
@@ -389,7 +388,7 @@ class GenericExtractor(Extractor):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -404,12 +403,12 @@ class GenericExtractor(Extractor):
text = " ".join([line.text for line in subs])
new_media.set(f"subtitles_{lang}", text)
except Exception as e:
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
result.add_media(new_media)
except Exception as e:
logger.error(f"Error processing entry {entry}: {e}")
logger.error(f"error processing entry {entry}: {e}")
if not len(result.media):
logger.info(f"No media found for entry {entry}, skipping.")
logger.info(f"no media found for entry {entry}, skipping.")
return False
return self.add_metadata(data, info_extractor, url, result)
@@ -471,14 +470,14 @@ class GenericExtractor(Extractor):
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
if data.get("is_live", False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video, that the youtubdedl can download out of the box
return self.get_metadata_for_video(data, info_extractor, url, ydl)
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp()
# don't download since it can be a live stream
@@ -497,17 +496,17 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp):
logger.debug(
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
)
try:
result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
logger.error("error downloading metadata for post: {error}", error=str(post_e))
return False
except Exception as generic_e:
logger.debug(
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
name=info_extractor.IE_NAME,
error=str(generic_e),
exc_info=True,
@@ -560,17 +559,17 @@ class GenericExtractor(Extractor):
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if "username" in auth and "password" in auth:
logger.debug(f"Using provided auth username and password for {url}")
logger.debug("using provided auth username and password")
ydl_options.extend(("--username", auth["username"]))
ydl_options.extend(("--password", auth["password"]))
elif "cookie" in auth:
logger.debug(f"Using provided auth cookie for {url}")
logger.debug("using provided auth cookie")
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
elif "cookies_from_browser" in auth:
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
elif "cookies_file" in auth:
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
logger.debug(f"using cookies from file {auth['cookies_file']}")
ydl_options.extend(("--cookies", auth["cookies_file"]))
# Applying user-defined extractor_args
@@ -580,11 +579,11 @@ class GenericExtractor(Extractor):
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else:
arg_str = str(args)
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
logger.debug(f"setting extractor_args: {key}:{arg_str}")
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args:
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
ydl_options += self.ytdlp_args.split(" ")
*_, validated_options = yt_dlp.parse_options(ydl_options)

View File

@@ -1,5 +1,5 @@
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
def extract_post(self, url: str, ie_instance):
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
endpoint = self.TIKWM_ENDPOINT.format(url=url)

View File

@@ -1,7 +1,7 @@
import re
import mimetypes
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
raise ValueError("Error retreiving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False
full_text = tweet.pop("full_text", "")
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug("No media found, archiving tweet text only")
logger.debug("no media found, archiving tweet text only")
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):

View File

@@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects.
"""
import os
import traceback
from typing import Tuple, Union, Iterator
from urllib.parse import quote
import gspread
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from slugify import slugify
from retrying import retry
@@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
sh = self.open_sheet()
for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
continue
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.debug(
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
)
continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.info(f"Finished worksheet {worksheet.title}")
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
# process and yield metadata here:
yield from self._process_rows(gw)
logger.info(f"finished worksheet {worksheet.title}")
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
@@ -69,7 +70,9 @@ class GsheetsFeederDB(Feeder, Database):
# All checks done - archival process starts here
m = Metadata().set_url(url)
self._set_context(m, gw, row)
yield m
with logger.contextualize(row=row):
yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
# TODO: Check folder value not being recognised
@@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database):
return missing
def started(self, item: Metadata) -> None:
logger.info(f"STARTED {item}")
logger.info("STARTED")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}")
logger.error("FAILED")
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
logger.warning("ABORTED")
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
@@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database):
cell_updates = []
row_values = gw.get_row(row)
spreadsheet = gw.wks.spreadsheet.title
worksheet = gw.wks.title
logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}")
logger.info("DONE")
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
@@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
logger.error(f"unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
@@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")
logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
logger.error(
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
)
logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
return gw, row

View File

@@ -9,7 +9,7 @@ making it suitable for handling large files efficiently.
"""
import hashlib
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
@@ -22,8 +22,7 @@ class HashEnricher(Enricher):
"""
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
logger.debug(f"calculating media hashes with algo={self.algorithm}")
for i, m in enumerate(to_enrich.media):
if len(hd := self.calculate_hash(m.filename)):

View File

@@ -4,7 +4,7 @@ import os
import pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import json
import base64
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
def format(self, item: Metadata) -> Media:
url = item.get_url()
if item.is_empty():
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
logger.debug("nothing to format, skipping")
return
content = self.template.render(

View File

@@ -14,7 +14,7 @@ from datetime import datetime
import traceback
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from retrying import retry
from tqdm import tqdm
@@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor):
url = item.get_url()
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return
if len(insta_matches) > 1:
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
logger.debug("multiple instagram matches found, using the first one")
return
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
if g1 == "":
@@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor):
return self.download_post(item, id=g3, context="story")
return self.download_stories(item, g2)
else:
logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
logger.warning(f"unknown instagram regex group match {g1=}")
return
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
count_posts += len(stories)
result.set("#stories", len(stories))
except Exception as e:
result.append("errors", f"Error downloading stories for {username}")
logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading stories for {username}")
logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
# download all posts
try:
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
)
except Exception as e:
result.append("errors", f"Error downloading posts for {username}")
logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading posts for {username}")
logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
# download all tagged
try:
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
)
except Exception as e:
result.append("errors", f"Error downloading tagged posts for {username}")
logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading tagged posts for {username}")
logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
# download all highlights
try:
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
except Exception as e:
result.append(
"errors",
f"Error downloading highlight id{h.get('pk')} for {username}",
f"error downloading highlight id{h.get('pk')} for {username}",
)
logger.error(
f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
)
if count_highlights >= max_to_download:
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
@@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, h, "highlight")
except Exception as e:
result.append("errors", f"Error downloading highlight {h.get('id')}")
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading highlight {h.get('id')}")
logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
return h_info
@@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, p, "post")
except Exception as e:
result.append("errors", f"Error downloading post {p.get('id')}")
logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading post {p.get('id')}")
logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1)
post_count += 1
if post_count >= max_to_download:
@@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor):
try:
self.scrape_item(result, p, "tagged")
except Exception as e:
result.append("errors", f"Error downloading tagged post {p.get('id')}")
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
result.append("errors", f"error downloading tagged post {p.get('id')}")
logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1)
tagged_count += 1
if tagged_count >= max_to_download:

View File

@@ -8,7 +8,7 @@ import re
import os
import shutil
import instaloader
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata
@@ -29,8 +29,9 @@ class InstagramExtractor(Extractor):
# TODO: links to stories
def setup(self) -> None:
logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
logger.warning(
"Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead."
)
self.insta = instaloader.Instaloader(
download_geotags=True,
@@ -43,12 +44,11 @@ class InstagramExtractor(Extractor):
self.insta.load_session_from_file(self.username, self.session_file)
except Exception:
try:
logger.debug("Session file failed", exc_info=True)
logger.info("No valid session file found - Attempting login with use and password.")
logger.info("no valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file)
except Exception as e:
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
@@ -72,14 +72,14 @@ class InstagramExtractor(Extractor):
result = self.download_profile(url, profile_matches[0])
except Exception as e:
logger.error(
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
)
finally:
shutil.rmtree(self.download_folder, ignore_errors=True)
return result
def download_post(self, url: str, post_id: str) -> Metadata:
logger.debug(f"Instagram {post_id=} detected in {url=}")
logger.debug(f"Instagram {post_id=} detected")
post = instaloader.Post.from_shortcode(self.insta.context, post_id)
if self.insta.download_post(post, target=post.owner_username):
@@ -87,7 +87,7 @@ class InstagramExtractor(Extractor):
def download_profile(self, url: str, username: str) -> Metadata:
# gets posts, posts where username is tagged, igtv postss, stories, and highlights
logger.debug(f"Instagram {username=} detected in {url=}")
logger.debug(f"Instagram {username=} detected")
profile = instaloader.Profile.from_username(self.insta.context, username)
try:
@@ -95,27 +95,27 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
except Exception as e:
logger.error(f"Failed to download post: {post.shortcode}: {e}")
logger.error(f"failed to download post: {post.shortcode}: {e}")
except Exception as e:
logger.error(f"Failed profile.get_posts: {e}")
logger.error(f"failed profile.get_posts: {e}")
try:
for post in profile.get_tagged_posts():
try:
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
except Exception as e:
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
except Exception as e:
logger.error(f"Failed profile.get_tagged_posts: {e}")
logger.error(f"failed profile.get_tagged_posts: {e}")
try:
for post in profile.get_igtv_posts():
try:
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
except Exception as e:
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
except Exception as e:
logger.error(f"Failed profile.get_igtv_posts: {e}")
logger.error(f"failed profile.get_igtv_posts: {e}")
try:
for story in self.insta.get_stories([profile.userid]):
@@ -123,9 +123,9 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
except Exception as e:
logger.error(f"Failed to download story item: {item}: {e}")
logger.error(f"failed to download story item: {item}: {e}")
except Exception as e:
logger.error(f"Failed get_stories: {e}")
logger.error(f"failed get_stories: {e}")
try:
for highlight in self.insta.get_highlights(profile.userid):
@@ -133,9 +133,9 @@ class InstagramExtractor(Extractor):
try:
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
except Exception as e:
logger.error(f"Failed to download highlight item: {item}: {e}")
logger.error(f"failed to download highlight item: {item}: {e}")
except Exception as e:
logger.error(f"Failed get_highlights: {e}")
logger.error(f"failed get_highlights: {e}")
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
@@ -158,4 +158,4 @@ class InstagramExtractor(Extractor):
return result.success("instagram")
except Exception as e:
logger.error(f"Could not fetch instagram post {url} due to: {e}")
logger.error(f"could not fetch instagram post due to: {e}")

View File

@@ -12,7 +12,7 @@ import shutil
import time
from sqlite3 import OperationalError
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor

View File

@@ -1,5 +1,5 @@
import json
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import os
from auto_archiver.core import Enricher

View File

@@ -1,7 +1,7 @@
import shutil
from typing import IO
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage

View File

@@ -1,6 +1,6 @@
import datetime
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -1,6 +1,6 @@
import subprocess
import traceback
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -1,6 +1,6 @@
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import opentimestamps
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile

View File

@@ -15,7 +15,7 @@ import traceback
import pdqhash
import numpy as np
from PIL import Image, UnidentifiedImageError
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -2,7 +2,7 @@ from typing import IO
import boto3
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage

View File

@@ -2,7 +2,7 @@ import ssl
import os
from slugify import slugify
from urllib.parse import urlparse
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media

View File

@@ -2,7 +2,7 @@ import requests
import re
import html
from bs4 import BeautifulSoup
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media

View File

@@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import (
)
from tqdm import tqdm
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media

View File

@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg
import os
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata

View File

@@ -5,7 +5,7 @@ import hashlib
from slugify import slugify
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
from rfc3161_client import VerificationError as Rfc3161VerificationError

View File

@@ -4,7 +4,7 @@ import re
import mimetypes
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from pytwitter import Api
from slugify import slugify

View File

@@ -4,7 +4,7 @@ import os
import shutil
import subprocess
from zipfile import ZipFile
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata

View File

@@ -1,5 +1,5 @@
import json
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import time
import requests

View File

@@ -1,7 +1,7 @@
import traceback
import requests
import time
from loguru import logger
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media

View File

@@ -0,0 +1,37 @@
from loguru import logger
import json
def extract_log_data(record):
    """
    Build a compact, JSON-serializable summary of a log record.

    The returned dict always contains ``level``, ``time`` (ISO-8601 with
    second precision), ``loc`` (``file:function:line``) and ``message``.
    The ``trace``, ``url``, ``worksheet`` and ``row`` entries of
    ``record["extra"]`` are copied over only when present and truthy.

    When the record carries exception information it is reduced to a plain
    dict with the keys ``type`` (exception class name), ``value``
    (``str()`` of the exception) and ``traceback`` (whether a traceback is
    attached). The raw ``(type, value, traceback)`` triple holds a class,
    an exception instance and a traceback object, none of which
    ``json.dumps`` can encode, so storing it as-is makes serialization
    raise ``TypeError``.

    :param record: mapping with the keys ``level``, ``time``, ``file``,
        ``function``, ``line`` and ``message``, plus optional ``extra``
        and ``exception``.
    :return: dict of log fields that is safe to pass to ``json.dumps``
        as long as the copied ``extra`` values are themselves JSON-safe.
    """
    subset = {
        "level": record["level"].name,
        "time": record["time"].isoformat(timespec="seconds"),
        "loc": f"{record['file'].name}:{record['function']}:{record['line']}",
    }

    # Only a fixed set of contextual keys is surfaced; everything else in
    # ``extra`` is deliberately left out of the summary.
    extra = record.get("extra", {})
    for extra_key in ("trace", "url", "worksheet", "row"):
        if extra_val := extra.get(extra_key):
            subset[extra_key] = extra_val

    subset["message"] = record["message"]

    if exception := record.get("exception"):
        # Flatten to primitives so the summary stays JSON-serializable.
        exc_type, exc_value, exc_traceback = exception
        subset["exception"] = {
            "type": None if exc_type is None else exc_type.__name__,
            "value": None if exc_value is None else str(exc_value),
            "traceback": bool(exc_traceback),
        }
    return subset
def serialize_no_message(record):
    """
    Return the record's extracted log data as a JSON string, without the
    ``message`` field.

    :param record: log record mapping accepted by ``extract_log_data``.
    :return: JSON text (non-ASCII characters are kept as-is).
    """
    payload = {
        field: value
        for field, value in extract_log_data(record).items()
        if field != "message"
    }
    return json.dumps(payload, ensure_ascii=False)
def serialize(record):
    """
    Return the record's extracted log data, message included, as a JSON string.

    :param record: log record mapping accepted by ``extract_log_data``.
    :return: JSON text (non-ASCII characters are kept as-is).
    """
    log_data = extract_log_data(record)
    return json.dumps(log_data, ensure_ascii=False)
def patching(record):
    """
    Attach both JSON renderings of the record to ``record["extra"]``.

    ``extra["serialized"]`` receives the full JSON (with message) and
    ``extra["serialize_no_message"]`` the variant without it. The record is
    modified in place; nothing is returned.

    :param record: log record mapping with an ``extra`` dict.
    """
    with_message = serialize(record)
    extra = record["extra"]
    extra["serialized"] = with_message
    extra["serialize_no_message"] = serialize_no_message(record)
# Rebind the module-level `logger` to the result of `logger.patch(patching)`.
# Other modules import `logger` from this module instead of from loguru directly.
# NOTE(review): presumably `patch` makes loguru call `patching` on every record
# before it reaches the handlers — confirm against the loguru documentation.
logger = logger.patch(patching)

View File

@@ -7,7 +7,7 @@ from datetime import datetime, timezone
from dateutil.parser import parse as parse_dt
import requests
from loguru import logger
from auto_archiver.utils.custom_logger import logger
def mkdir_if_not_exists(folder):

View File

@@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
from loguru import logger
from auto_archiver.utils.custom_logger import logger
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory

View File

@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor
from loguru import logger
from auto_archiver.utils.custom_logger import logger
class ExampleExtractor(Extractor):

View File

@@ -1,6 +1,6 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
from loguru import logger
from auto_archiver.utils.custom_logger import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):

View File

@@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path):
def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]):
def cleanup():
from loguru import logger
from auto_archiver.utils.custom_logger import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0