concludes logging standardization refactor

This commit is contained in:
msramalho
2025-06-26 17:20:04 +01:00
parent ad29cb4447
commit afd9090a4c
48 changed files with 207 additions and 252 deletions

View File

@@ -24,4 +24,4 @@ SESSION_FILE = "secrets/anon-insta"
os.makedirs("secrets", exist_ok=True) os.makedirs("secrets", exist_ok=True)
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client: with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
logger.success(f"New session file created: {SESSION_FILE}.session") logger.success(f"new session file created: {SESSION_FILE}.session")

View File

@@ -94,7 +94,7 @@ class Extractor(BaseModule):
to_filename = to_filename[-64:] to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename) to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: if verbose:
logger.debug(f"downloading {to_filename=}") logger.debug(f"Downloading {to_filename=}")
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
} }

View File

@@ -86,7 +86,7 @@ class Media:
@property # getter .mimetype @property # getter .mimetype
def mimetype(self) -> str: def mimetype(self) -> str:
if not self.filename or len(self.filename) == 0: if not self.filename or len(self.filename) == 0:
logger.warning(f"cannot get mimetype from media without filename: {self}") logger.warning(f"Cannot get mimetype from media without filename: {self}")
return "" return ""
if not self._mimetype: if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0] self._mimetype = mimetypes.guess_type(self.filename)[0]
@@ -116,7 +116,7 @@ class Media:
# self.is_video() should be used together with this method # self.is_video() should be used together with this method
try: try:
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"] streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
logger.debug(f"STREAMS FOR {self.filename} {streams}") logger.debug(f"Streams for {self.filename}: {streams}")
return any(s.get("duration_ts", 0) > 0 for s in streams) return any(s.get("duration_ts", 0) > 0 for s in streams)
except Error: except Error:
return False # ffmpeg errors when reading bad files return False # ffmpeg errors when reading bad files

View File

@@ -539,11 +539,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
for feeder in self.feeders: for feeder in self.feeders:
for item in feeder: for item in feeder:
with logger.contextualize(url=item.get_url(), trace=random_str(12)): with logger.contextualize(url=item.get_url(), trace=random_str(12)):
logger.info("started processing") logger.info("Started processing")
yield self.feed_item(item) yield self.feed_item(item)
url_count += 1 url_count += 1
logger.info(f"processed {url_count} URL(s)") logger.info(f"Processed {url_count} URL(s)")
self.cleanup() self.cleanup()
def feed_item(self, item: Metadata) -> Metadata: def feed_item(self, item: Metadata) -> Metadata:
@@ -561,7 +561,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
return self.archive(item) return self.archive(item)
except KeyboardInterrupt: except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit # catches keyboard interruptions to do a clean exit
logger.warning("caught interrupt") logger.warning("Caught interrupt")
for d in self.databases: for d in self.databases:
d.aborted(item) d.aborted(item)
self.cleanup() self.cleanup()
@@ -620,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try: try:
d.done(cached_result, cached=True) d.done(cached_result, cached=True)
except Exception as e: except Exception as e:
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}")
return cached_result return cached_result
# 3 - call extractors until one succeeds # 3 - call extractors until one succeeds
for a in self.extractors: for a in self.extractors:
logger.info(f"trying extractor {a.name}") logger.info(f"Trying extractor {a.name}")
try: try:
result.merge(a.download(result)) result.merge(a.download(result))
if result.is_success(): if result.is_success():
break break
except Exception as e: except Exception as e:
logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}") logger.error(f"Extractor {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content # 4 - call enrichers to work with archived content
for e in self.enrichers: for e in self.enrichers:
try: try:
e.enrich(result) e.enrich(result)
except Exception as exc: except Exception as exc:
logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}") logger.error(f"Enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media # 5 - store all downloaded/generated media
result.store(storages=self.storages) result.store(storages=self.storages)
@@ -657,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try: try:
d.done(result) d.done(result)
except Exception as e: except Exception as e:
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}") logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}")
return result return result

View File

@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
continue # Skip imported modules/classes/functions continue # Skip imported modules/classes/functions
if isinstance(obj, type) and issubclass(obj, Dropin): if isinstance(obj, type) and issubclass(obj, Dropin):
dropins.append(obj) dropins.append(obj)
logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}") logger.debug(f"Loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
return dropins return dropins
def sanitize_url(self, url: str) -> str: def sanitize_url(self, url: str) -> str:
@@ -86,10 +86,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
try: try:
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb: with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
logger.info(f"selenium browser is up with agent {self.agent}, opening url...") logger.info(f"Selenium browser is up with agent {self.agent}, opening url...")
sb.uc_open_with_reconnect(url, 4) sb.uc_open_with_reconnect(url, 4)
logger.debug("handling CAPTCHAs for...") logger.debug("Handling CAPTCHAs for...")
sb.uc_gui_handle_cf() sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
@@ -97,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin.open_page(url) dropin.open_page(url)
if self.detect_auth_wall and self._hit_auth_wall(sb): if self.detect_auth_wall and self._hit_auth_wall(sb):
logger.warning("skipping since auth wall or CAPTCHA was detected") logger.warning("Skipping since auth wall or CAPTCHA was detected")
return False return False
sb.wait_for_ready_state_complete() sb.wait_for_ready_state_complete()
@@ -124,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
js_css_selector=dropin.js_for_video_css_selectors(), js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos, max_media=self.max_download_videos - downloaded_videos,
) )
logger.info("completed") logger.info("Completed")
return to_enrich return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e: except selenium.common.exceptions.SessionNotCreatedException as e:
if custom_data_dir: # the retry logic only works once if custom_data_dir: # the retry logic only works once
logger.error( logger.error(
f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though." f"Session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
) )
return self.enrich(to_enrich, custom_data_dir=False) return self.enrich(to_enrich, custom_data_dir=False)
raise e # re-raise raise e # re-raise
except Exception as e: except Exception as e:
logger.error(f"runtime error: {e}: {traceback.format_exc()}") logger.error(f"Runtime error: {e}: {traceback.format_exc()}")
return False return False
def _get_suitable_dropin(self, url: str, sb: SB): def _get_suitable_dropin(self, url: str, sb: SB):
@@ -145,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
""" """
for dropin in self.dropins: for dropin in self.dropins:
if dropin.suitable(url): if dropin.suitable(url):
logger.debug(f"using drop-in {dropin.__name__}") logger.debug(f"Using drop-in {dropin.__name__}")
return dropin(sb, self) return dropin(sb, self)
return DefaultDropin(sb, self) return DefaultDropin(sb, self)
@@ -240,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w) x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000) y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
logger.debug(f"setting window size to {x}x{y} for full page screenshot.") logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y) sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -279,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# js_for_css_selectors # js_for_css_selectors
for src in sources: for src in sources:
if len(all_urls) >= max_media: if len(all_urls) >= max_media:
logger.debug(f"reached max download limit of {max_media} images/videos.") logger.debug(f"Reached max download limit of {max_media} images/videos.")
break break
if not is_relevant_url(src): if not is_relevant_url(src):
continue continue

View File

@@ -144,7 +144,7 @@ class Dropin:
with yt_dlp.YoutubeDL(validated_options) as ydl: with yt_dlp.YoutubeDL(validated_options) as ydl:
for url in video_urls: for url in video_urls:
try: try:
logger.debug("downloading video from url") logger.debug(f"Downloading video from url: {url}")
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl_entry_to_filename(ydl, info) filename = ydl_entry_to_filename(ydl, info)
if not filename: # Failed to download video. if not filename: # Failed to download video.
@@ -156,5 +156,5 @@ class Dropin:
to_enrich.add_media(media) to_enrich.add_media(media)
downloaded += 1 downloaded += 1
except Exception as e: except Exception as e:
logger.error(f"download failed: {e} {traceback.format_exc()}") logger.error(f"Download failed: {e} {traceback.format_exc()}")
return downloaded return downloaded

View File

@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
username, password = self._get_username_password("linkedin.com") username, password = self._get_username_password("linkedin.com")
logger.debug("logging in to Linkedin with username: {}", username) logger.debug("Logging in to Linkedin with username: {}", username)
self.sb.type("#username", username) self.sb.type("#username", username)
self.sb.type("#password", password) self.sb.type("#password", password)
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)

View File

@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
self._close_cookies_banner() self._close_cookies_banner()
username, password = self._get_username_password("reddit.com") username, password = self._get_username_password("reddit.com")
logger.debug("logging in to Reddit with username: {}", username) logger.debug("Logging in to Reddit with username: {}", username)
self.sb.type("#login-username", username) self.sb.type("#login-username", username)
self.sb.type("#login-password", password) self.sb.type("#login-password", password)
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
self.sb.click_link_text("Log in") self.sb.click_link_text("Log in")
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
if self.sb.is_text_visible("Welcome back"): if self.sb.is_text_visible("Welcome back"):
logger.debug("login successful") logger.debug("Login successful")
self.sb.click_if_visible("this link") self.sb.click_if_visible("this link")
def _close_cookies_banner(self): def _close_cookies_banner(self):
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
.map(el => el.src || el.href) .map(el => el.src || el.href)
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url)); .filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
""") """)
logger.debug("found {} video URLs", len(filtered_urls)) logger.debug("Found {} video URLs", len(filtered_urls))
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich) return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)

View File

@@ -57,12 +57,12 @@ class VkDropin(Dropin):
self.sb.open("https://vk.com") self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete() self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url(): if "/feed" in self.sb.get_current_url():
logger.debug("already logged in to VK.") logger.debug("Already logged in to VK.")
return True return True
# need to login # need to login
username, password = self._get_username_password("vk.com") username, password = self._get_username_password("vk.com")
logger.debug("logging in to VK with username: {}", username) logger.debug("Logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10) self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10) self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)

View File

@@ -36,9 +36,9 @@ class AAApiDb(Database):
if not self.store_results: if not self.store_results:
return return
if cached: if cached:
logger.debug("skipping saving archive to AA API because it was cached") logger.debug("Skipping saving archive to AA API because it was cached")
return return
logger.debug("saving archive to the AA API.") logger.debug("Saving archive to the AA API.")
payload = { payload = {
"author_id": self.author_id, "author_id": self.author_id,

View File

@@ -72,7 +72,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
json={"metadata": {"processed": True, "status": "error", "error": reason}}, json={"metadata": {"processed": True, "status": "error", "error": reason}},
) )
logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}") logger.info(f"Stored failure ID {atlos_id} on Atlos: {reason}")
def fetch(self, item: Metadata) -> Union[Metadata, bool]: def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check and fetch if the given item has been archived already, each """check and fetch if the given item has been archived already, each
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
"""Mark an item as successfully archived in Atlos.""" """Mark an item as successfully archived in Atlos."""
atlos_id = item.metadata.get("atlos_id") atlos_id = item.metadata.get("atlos_id")
if not atlos_id: if not atlos_id:
logger.info("item has no Atlos ID, skipping") logger.info("Item has no Atlos ID, skipping")
return return
self._post( self._post(
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
} }
}, },
) )
logger.info(f"stored success ID {atlos_id} on Atlos") logger.info(f"Stored success ID {atlos_id} on Atlos")
# ! Atlos Module - Storage Methods # ! Atlos Module - Storage Methods
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
"""Upload a media file to Atlos if it has not been uploaded already.""" """Upload a media file to Atlos if it has not been uploaded already."""
if metadata is None: if metadata is None:
logger.error(f"no metadata provided for {media.filename}") logger.error(f"No metadata provided for {media.filename}")
return False return False
atlos_id = metadata.get("atlos_id") atlos_id = metadata.get("atlos_id")
if not atlos_id: if not atlos_id:
logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.") logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
return False return False
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
params={"title": media.properties}, params={"title": media.properties},
files={"file": (os.path.basename(media.filename), file_obj)}, files={"file": (os.path.basename(media.filename), file_obj)},
) )
logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
return True return True
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:

View File

@@ -20,19 +20,19 @@ class CSVFeeder(Feeder):
url_column = first_row.index(url_column) url_column = first_row.index(url_column)
except ValueError: except ValueError:
logger.error( logger.error(
f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?" f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
) )
return return
elif not (url_or_none(first_row[url_column])): elif not (url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already # it's a header row, but we've been given a column number already
logger.debug(f"skipping header row: {first_row}") logger.debug(f"Skipping header row: {first_row}")
else: else:
# first row isn't a header row, rewind the file # first row isn't a header row, rewind the file
f.seek(0) f.seek(0)
for row in reader: for row in reader:
if not url_or_none(row[url_column]): if not url_or_none(row[url_column]):
logger.warning(f"not a valid URL in row: {row}, skipping") logger.warning(f"Not a valid URL in row: {row}, skipping")
continue continue
url = row[url_column] url = row[url_column]
yield Metadata().set_url(url) yield Metadata().set_url(url)

View File

@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
def _setup_google_drive_service(self): def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials.""" """Initialize Google Drive service based on provided credentials."""
if self.oauth_token: if self.oauth_token:
logger.debug(f"using Google Drive OAuth token: {self.oauth_token}") logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token() self.service = self._initialize_with_oauth_token()
elif self.service_account: elif self.service_account:
logger.debug(f"using Google Drive service account: {self.service_account}") logger.debug(f"Using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account() self.service = self._initialize_with_service_account()
else: else:
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
if not creds.valid and creds.expired and creds.refresh_token: if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request()) creds.refresh(Request())
with open(self.oauth_token, "w") as token_file: with open(self.oauth_token, "w") as token_file:
logger.debug("saving refreshed OAuth token.") logger.debug("Saving refreshed OAuth token.")
token_file.write(creds.to_json()) token_file.write(creds.to_json())
elif not creds.valid: elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.") raise ValueError("Invalid OAuth token. Please regenerate the token.")
@@ -62,7 +62,7 @@ class GDriveStorage(Storage):
parent_id, folder_id = self.root_folder_id, None parent_id, folder_id = self.root_folder_id, None
path_parts = media.key.split(os.path.sep) path_parts = media.key.split(os.path.sep)
filename = path_parts[-1] filename = path_parts[-1]
logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") logger.info(f"Looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
for folder in path_parts[0:-1]: for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id parent_id = folder_id
@@ -70,7 +70,7 @@ class GDriveStorage(Storage):
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True) file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
if not file_id: if not file_id:
# #
logger.info(f"file {filename} not found in folder {folder_id}") logger.info(f"File {filename} not found in folder {folder_id}")
return None return None
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
@@ -83,7 +83,7 @@ class GDriveStorage(Storage):
parent_id, upload_to = self.root_folder_id, None parent_id, upload_to = self.root_folder_id, None
path_parts = media.key.split(os.path.sep) path_parts = media.key.split(os.path.sep)
filename = path_parts[-1] filename = path_parts[-1]
logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") logger.info(f"Checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
for folder in path_parts[0:-1]: for folder in path_parts[0:-1]:
upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
if upload_to is None: if upload_to is None:
@@ -91,7 +91,7 @@ class GDriveStorage(Storage):
parent_id = upload_to parent_id = upload_to
# upload file to gd # upload file to gd
logger.debug(f"uploading {filename=} to folder id {upload_to}") logger.debug(f"Uploading {filename=} to folder id {upload_to}")
file_metadata = {"name": [filename], "parents": [upload_to]} file_metadata = {"name": [filename], "parents": [upload_to]}
try: try:
media = MediaFileUpload(media.filename, resumable=True) media = MediaFileUpload(media.filename, resumable=True)
@@ -100,11 +100,11 @@ class GDriveStorage(Storage):
.create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id") .create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id")
.execute() .execute()
) )
logger.debug(f"uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}") logger.debug(f"Uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}")
except FileNotFoundError as e: except FileNotFoundError as e:
logger.error(f"gd uploadf: file not found {media.filename=} - {e}") logger.error(f"GD uploadf: file not found {media.filename=} - {e}")
except Exception as e: except Exception as e:
logger.error(f"gd uploadf: error uploading {media.filename=} to {upload_to} - {e}") logger.error(f"GD uploadf: error uploading {media.filename=} to {upload_to} - {e}")
# must be implemented even if unused # must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
@@ -133,7 +133,7 @@ class GDriveStorage(Storage):
self.api_cache = getattr(self, "api_cache", {}) self.api_cache = getattr(self, "api_cache", {})
cache_key = f"{parent_id}_{name}_{use_mime_type}" cache_key = f"{parent_id}_{name}_{use_mime_type}"
if cache_key in self.api_cache: if cache_key in self.api_cache:
logger.debug(f"cache hit for {cache_key=}") logger.debug(f"Cache hit for {cache_key=}")
return self.api_cache[cache_key] return self.api_cache[cache_key]
# API logic # API logic
@@ -168,7 +168,7 @@ class GDriveStorage(Storage):
else: else:
logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.") logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.")
if attempt < retries - 1: if attempt < retries - 1:
logger.debug(f"sleeping for {sleep_seconds} second(s)") logger.debug(f"Sleeping for {sleep_seconds} second(s)")
time.sleep(sleep_seconds) time.sleep(sleep_seconds)
if raise_on_missing: if raise_on_missing:
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
Creates a new GDrive folder @name inside folder @parent_id Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder Returns id of the created folder
""" """
logger.debug(f"creating new folder with {name=} inside {parent_id=}") logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]} file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute() gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
return gd_folder.get("id") return gd_folder.get("id")

View File

@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
# download if embeds present (1 video XOR >=1 images) # download if embeds present (1 video XOR >=1 images)
for media in self._download_bsky_embeds(post, archiver): for media in self._download_bsky_embeds(post, archiver):
result.add_media(media) result.add_media(media)
logger.debug(f"downloaded {len(result.media)} media files") logger.debug(f"Downloaded {len(result.media)} media files")
return result return result

View File

@@ -67,7 +67,7 @@ class GenericExtractor(Extractor):
os.execv(sys.executable, [sys.executable] + sys.argv) os.execv(sys.executable, [sys.executable] + sys.argv)
def update_package(self, package_name: str) -> bool: def update_package(self, package_name: str) -> bool:
logger.info(f"checking and updating {package_name}...") logger.info(f"Checking and updating {package_name}...")
from importlib.metadata import version as get_version from importlib.metadata import version as get_version
old_version = get_version(package_name) old_version = get_version(package_name)
@@ -79,7 +79,7 @@ class GenericExtractor(Extractor):
return True return True
logger.info(f"{package_name} already up to date") logger.info(f"{package_name} already up to date")
except Exception as e: except Exception as e:
logger.error(f"failed to update {package_name}: {e}") logger.error(f"Failed to update {package_name}: {e}")
return False return False
def setup_po_tokens(self) -> None: def setup_po_tokens(self) -> None:
@@ -110,7 +110,7 @@ class GenericExtractor(Extractor):
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None] missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
if missing_tools: if missing_tools:
logger.error( logger.error(
f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. " f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
"Install these tools or run bgutils via Docker. " "Install these tools or run bgutils via Docker. "
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider" "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
) )
@@ -139,7 +139,7 @@ class GenericExtractor(Extractor):
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip" f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
) )
zip_path = os.path.join(base_dir, f"{plugin_version}.zip") zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
logger.info(f"downloading bgutils release zip for version {plugin_version}...") logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
urlretrieve(zip_url, zip_path) urlretrieve(zip_url, zip_path)
with zipfile.ZipFile(zip_path, "r") as z: with zipfile.ZipFile(zip_path, "r") as z:
z.extractall(base_dir) z.extractall(base_dir)
@@ -148,7 +148,7 @@ class GenericExtractor(Extractor):
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}") extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
shutil.move(os.path.join(extracted_root, "server"), server_dir) shutil.move(os.path.join(extracted_root, "server"), server_dir)
shutil.rmtree(extracted_root) shutil.rmtree(extracted_root)
logger.info("installing dependencies and transpiling PoT Generator script...") logger.info("Installing dependencies and transpiling PoT Generator script...")
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True) subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True) subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
@@ -164,7 +164,7 @@ class GenericExtractor(Extractor):
logger.info(f"PO Token script configured at: {script_path}") logger.info(f"PO Token script configured at: {script_path}")
except Exception as e: except Exception as e:
logger.error(f"failed to set up PO Token script: {e}") logger.error(f"Failed to set up PO Token script: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]: def suitable_extractors(self, url: str) -> Generator[str, None, None]:
""" """
@@ -205,7 +205,7 @@ class GenericExtractor(Extractor):
media = Media(cover_image_path) media = Media(cover_image_path)
metadata.add_media(media, id="cover") metadata.add_media(media, id="cover")
except Exception as e: except Exception as e:
logger.error(f"could not download cover image {thumbnail_url}: {e}") logger.error(f"Could not download cover image {thumbnail_url}: {e}")
dropin = self.dropin_for_name(info_extractor.ie_key()) dropin = self.dropin_for_name(info_extractor.ie_key())
if dropin: if dropin:
@@ -352,7 +352,7 @@ class GenericExtractor(Extractor):
if not dropin: if not dropin:
# TODO: add a proper link to 'how to create your own dropin' # TODO: add a proper link to 'how to create your own dropin'
logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}. logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
return False return False
@@ -374,7 +374,7 @@ class GenericExtractor(Extractor):
if "entries" in data: if "entries" in data:
entries = data.get("entries", []) entries = data.get("entries", [])
if not len(entries): if not len(entries):
logger.info("YoutubeDLArchiver could not find any video") logger.info("GenericExtractor could not find any video")
return False return False
else: else:
entries = [data] entries = [data]
@@ -388,7 +388,7 @@ class GenericExtractor(Extractor):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue continue
logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}") logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename) new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]: for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -403,12 +403,12 @@ class GenericExtractor(Extractor):
text = " ".join([line.text for line in subs]) text = " ".join([line.text for line in subs])
new_media.set(f"subtitles_{lang}", text) new_media.set(f"subtitles_{lang}", text)
except Exception as e: except Exception as e:
logger.error(f"error loading subtitle file {val.get('filepath')}: {e}") logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
result.add_media(new_media) result.add_media(new_media)
except Exception as e: except Exception as e:
logger.error(f"error processing entry {entry}: {e}") logger.error(f"Error processing entry {entry}: {e}")
if not len(result.media): if not len(result.media):
logger.info(f"no media found for entry {entry}, skipping.") logger.info(f"No media found for entry {entry}, skipping.")
return False return False
return self.add_metadata(data, info_extractor, url, result) return self.add_metadata(data, info_extractor, url, result)
@@ -470,14 +470,14 @@ class GenericExtractor(Extractor):
def _helper_for_successful_extract_info(data, info_extractor, url, ydl): def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
if data.get("is_live", False) and not self.livestreams: if data.get("is_live", False) and not self.livestreams:
logger.warning("livestream detected, skipping due to 'livestreams' configuration setting") logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False return False
# it's a valid video, that the youtubdedl can download out of the box # it's a valid video, that the youtubdedl can download out of the box
return self.get_metadata_for_video(data, info_extractor, url, ydl) return self.get_metadata_for_video(data, info_extractor, url, ydl)
try: try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor): if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}") logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp() raise SkipYtdlp()
# don't download since it can be a live stream # don't download since it can be a live stream
@@ -496,17 +496,17 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp): if not isinstance(e, SkipYtdlp):
logger.debug( logger.debug(
f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead' f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
) )
try: try:
result = self.get_metadata_for_post(info_extractor, url, ydl) result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
logger.error("error downloading metadata for post: {error}", error=str(post_e)) logger.error("Error downloading metadata for post: {error}", error=str(post_e))
return False return False
except Exception as generic_e: except Exception as generic_e:
logger.debug( logger.debug(
'attempt to extract using ytdlp extractor "{name}" failed: \n {error}', 'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
name=info_extractor.IE_NAME, name=info_extractor.IE_NAME,
error=str(generic_e), error=str(generic_e),
exc_info=True, exc_info=True,
@@ -559,17 +559,17 @@ class GenericExtractor(Extractor):
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth: if auth:
if "username" in auth and "password" in auth: if "username" in auth and "password" in auth:
logger.debug("using provided auth username and password") logger.debug("Using provided auth username and password")
ydl_options.extend(("--username", auth["username"])) ydl_options.extend(("--username", auth["username"]))
ydl_options.extend(("--password", auth["password"])) ydl_options.extend(("--password", auth["password"]))
elif "cookie" in auth: elif "cookie" in auth:
logger.debug("using provided auth cookie") logger.debug("Using provided auth cookie")
yt_dlp.utils.std_headers["cookie"] = auth["cookie"] yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
elif "cookies_from_browser" in auth: elif "cookies_from_browser" in auth:
logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}") logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']}")
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
elif "cookies_file" in auth: elif "cookies_file" in auth:
logger.debug(f"using cookies from file {auth['cookies_file']}") logger.debug(f"Using cookies from file {auth['cookies_file']}")
ydl_options.extend(("--cookies", auth["cookies_file"])) ydl_options.extend(("--cookies", auth["cookies_file"]))
# Applying user-defined extractor_args # Applying user-defined extractor_args
@@ -579,11 +579,11 @@ class GenericExtractor(Extractor):
arg_str = ";".join(f"{k}={v}" for k, v in args.items()) arg_str = ";".join(f"{k}={v}" for k, v in args.items())
else: else:
arg_str = str(args) arg_str = str(args)
logger.debug(f"setting extractor_args: {key}:{arg_str}") logger.debug(f"Setting extractor_args: {key}:{arg_str}")
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"]) ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
if self.ytdlp_args: if self.ytdlp_args:
logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}") logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}")
ydl_options += self.ytdlp_args.split(" ") ydl_options += self.ytdlp_args.split(" ")
*_, validated_options = yt_dlp.parse_options(ydl_options) *_, validated_options = yt_dlp.parse_options(ydl_options)

View File

@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
def extract_post(self, url: str, ie_instance): def extract_post(self, url: str, ie_instance):
logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}") logger.debug("Using Tikwm API to attempt to download tiktok video")
endpoint = self.TIKWM_ENDPOINT.format(url=url) endpoint = self.TIKWM_ENDPOINT.format(url=url)
@@ -62,7 +62,7 @@ class Tiktok(GenericDropin):
# get the video or fail # get the video or fail
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}") video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
if not video_downloaded: if not video_downloaded:
logger.error(f"failed to download video from {video_url}") logger.error("Failed to download video")
return False return False
video_media = Media(video_downloaded) video_media = Media(video_downloaded)
if duration := post.get("duration", None): if duration := post.get("duration", None):

View File

@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
raise ValueError("Error retreiving post. Are you sure it exists?") raise ValueError("Error retreiving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex: except (ValueError, KeyError) as ex:
logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False return False
full_text = tweet.pop("full_text", "") full_text = tweet.pop("full_text", "")
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"): if not tweet.get("entities", {}).get("media"):
logger.debug("no media found, archiving tweet text only") logger.debug("No media found, archiving tweet text only")
result.status = "twitter-ytdl" result.status = "twitter-ytdl"
return result return result
for i, tw_media in enumerate(tweet["entities"]["media"]): for i, tw_media in enumerate(tweet["entities"]["media"]):

View File

@@ -42,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
sh = self.open_sheet() sh = self.open_sheet()
for ii, worksheet in enumerate(sh.worksheets()): for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title): if not self.should_process_sheet(worksheet.title):
logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules") logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules")
continue continue
logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}") logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)): if len(missing_cols := self.missing_required_columns(gw)):
logger.debug( logger.debug(
f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}" f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
) )
continue continue
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"): with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
# process and yield metadata here: # process and yield metadata here:
yield from self._process_rows(gw) yield from self._process_rows(gw)
logger.info(f"finished worksheet {worksheet.title}") logger.info(f"Finished worksheet {worksheet.title}")
def _process_rows(self, gw: GWorksheet): def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1): for row in range(1 + self.header, gw.count_rows() + 1):
@@ -133,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value)) cell_updates.append((row, col, final_value))
except Exception as e: except Exception as e:
logger.error(f"unable to batch {col}={final_value} due to {e}") logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status status_message = item.status
if cached: if cached:
@@ -193,13 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
gw, row = self._retrieve_gsheet(item) gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", new_status) gw.set_cell(row, "status", new_status)
except Exception as e: except Exception as e:
logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}") logger.debug(f"Unable to update sheet: {e}: {traceback.format_exc()}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"): if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet") gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row") row: int = gsheet.get("row")
elif self.sheet_id: elif self.sheet_id:
logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.") logger.error("Unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
return gw, row return gw, row

View File

@@ -22,7 +22,7 @@ class HashEnricher(Enricher):
""" """
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
logger.debug(f"calculating media hashes with algo={self.algorithm}") logger.debug(f"Calculating media hashes with algo={self.algorithm}")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
if len(hd := self.calculate_hash(m.filename)): if len(hd := self.calculate_hash(m.filename)):

View File

@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
def format(self, item: Metadata) -> Media: def format(self, item: Metadata) -> Media:
url = item.get_url() url = item.get_url()
if item.is_empty(): if item.is_empty():
logger.debug("nothing to format, skipping") logger.debug("Nothing to format, skipping")
return return
content = self.template.render( content = self.template.render(

View File

@@ -49,7 +49,7 @@ class InstagramAPIExtractor(Extractor):
if not len(insta_matches) or len(insta_matches[0]) != 3: if not len(insta_matches) or len(insta_matches[0]) != 3:
return return
if len(insta_matches) > 1: if len(insta_matches) > 1:
logger.debug("multiple instagram matches found, using the first one") logger.debug("Multiple instagram matches found, using the first one")
return return
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
if g1 == "": if g1 == "":
@@ -65,13 +65,13 @@ class InstagramAPIExtractor(Extractor):
return self.download_post(item, id=g3, context="story") return self.download_post(item, id=g3, context="story")
return self.download_stories(item, g2) return self.download_stories(item, g2)
else: else:
logger.warning(f"unknown instagram regex group match {g1=}") logger.warning(f"Unknown instagram regex group match {g1=}")
return return
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
def call_api(self, path: str, params: dict) -> dict: def call_api(self, path: str, params: dict) -> dict:
headers = {"accept": "application/json", "x-access-key": self.access_token} headers = {"accept": "application/json", "x-access-key": self.access_token}
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}") logger.debug(f"Calling {self.api_endpoint}/{path} with {params=}")
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json() return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
def cleanup_dict(self, d: dict | list) -> dict: def cleanup_dict(self, d: dict | list) -> dict:
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
count_posts += len(stories) count_posts += len(stories)
result.set("#stories", len(stories)) result.set("#stories", len(stories))
except Exception as e: except Exception as e:
result.append("errors", f"error downloading stories for {username}") result.append("errors", f"Error downloading stories for {username}")
logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}") logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
# download all posts # download all posts
try: try:
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts result, user_id, max_to_download=self.full_profile_max_posts - count_posts
) )
except Exception as e: except Exception as e:
result.append("errors", f"error downloading posts for {username}") result.append("errors", f"Error downloading posts for {username}")
logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}") logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
# download all tagged # download all tagged
try: try:
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
result, user_id, max_to_download=self.full_profile_max_posts - count_posts result, user_id, max_to_download=self.full_profile_max_posts - count_posts
) )
except Exception as e: except Exception as e:
result.append("errors", f"error downloading tagged posts for {username}") result.append("errors", f"Error downloading tagged posts for {username}")
logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}") logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
# download all highlights # download all highlights
try: try:
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
except Exception as e: except Exception as e:
result.append( result.append(
"errors", "errors",
f"error downloading highlight id{h.get('pk')} for {username}", f"Error downloading highlight id{h.get('pk')} for {username}",
) )
logger.error( logger.error(
f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}" f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
) )
if count_highlights >= max_to_download: if count_highlights >= max_to_download:
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}") logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
@@ -210,8 +210,8 @@ class InstagramAPIExtractor(Extractor):
try: try:
self.scrape_item(result, h, "highlight") self.scrape_item(result, h, "highlight")
except Exception as e: except Exception as e:
result.append("errors", f"error downloading highlight {h.get('id')}") result.append("errors", f"Error downloading highlight {h.get('id')}")
logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}") logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
return h_info return h_info
@@ -248,13 +248,13 @@ class InstagramAPIExtractor(Extractor):
break break
posts, end_cursor = posts[0], posts[1] posts, end_cursor = posts[0], posts[1]
posts = posts[: min(max_to_download, len(posts))] posts = posts[: min(max_to_download, len(posts))]
logger.info(f"parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}") logger.info(f"Parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}")
for p in posts: for p in posts:
try: try:
self.scrape_item(result, p, "post") self.scrape_item(result, p, "post")
except Exception as e: except Exception as e:
result.append("errors", f"error downloading post {p.get('id')}") result.append("errors", f"Error downloading post {p.get('id')}")
logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}") logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1) pbar.update(1)
post_count += 1 post_count += 1
if post_count >= max_to_download: if post_count >= max_to_download:
@@ -275,14 +275,14 @@ class InstagramAPIExtractor(Extractor):
break break
next_page_id = resp.get("next_page_id") next_page_id = resp.get("next_page_id")
logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}") logger.info(f"Parsing {len(posts)} tagged posts, next {next_page_id=}")
posts = posts[: min(max_to_download, len(posts))] posts = posts[: min(max_to_download, len(posts))]
for p in posts: for p in posts:
try: try:
self.scrape_item(result, p, "tagged") self.scrape_item(result, p, "tagged")
except Exception as e: except Exception as e:
result.append("errors", f"error downloading tagged post {p.get('id')}") result.append("errors", f"Error downloading tagged post {p.get('id')}")
logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}") logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
pbar.update(1) pbar.update(1)
tagged_count += 1 tagged_count += 1
if tagged_count >= max_to_download: if tagged_count >= max_to_download:

View File

@@ -7,6 +7,7 @@ highlights, and tagged posts. Authentication is required via username/password o
import re import re
import os import os
import shutil import shutil
import traceback
import instaloader import instaloader
from auto_archiver.utils.custom_logger import logger from auto_archiver.utils.custom_logger import logger
@@ -44,11 +45,11 @@ class InstagramExtractor(Extractor):
self.insta.load_session_from_file(self.username, self.session_file) self.insta.load_session_from_file(self.username, self.session_file)
except Exception: except Exception:
try: try:
logger.info("no valid session file found - Attempting login with use and password.") logger.info("No valid session file found - Attempting login with username and password.")
self.insta.login(self.username, self.password) self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file) self.insta.save_session_to_file(self.session_file)
except Exception as e: except Exception as e:
logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}") logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()
@@ -72,7 +73,7 @@ class InstagramExtractor(Extractor):
result = self.download_profile(url, profile_matches[0]) result = self.download_profile(url, profile_matches[0])
except Exception as e: except Exception as e:
logger.error( logger.error(
f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid." f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
) )
finally: finally:
shutil.rmtree(self.download_folder, ignore_errors=True) shutil.rmtree(self.download_folder, ignore_errors=True)
@@ -95,27 +96,27 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_post(post, target=f"profile_post_{post.owner_username}") self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"failed to download post: {post.shortcode}: {e}") logger.error(f"Failed to download post: {post.shortcode}: {e} {traceback.format_exc()}")
except Exception as e: except Exception as e:
logger.error(f"failed profile.get_posts: {e}") logger.error(f"Failed profile.get_posts: {e}: {traceback.format_exc()}")
try: try:
for post in profile.get_tagged_posts(): for post in profile.get_tagged_posts():
try: try:
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"failed to download tagged post: {post.shortcode}: {e}") logger.error(f"Failed to download tagged post: {post.shortcode}: {e} {traceback.format_exc()}")
except Exception as e: except Exception as e:
logger.error(f"failed profile.get_tagged_posts: {e}") logger.error(f"Failed profile.get_tagged_posts: {e} {traceback.format_exc()}")
try: try:
for post in profile.get_igtv_posts(): for post in profile.get_igtv_posts():
try: try:
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"failed to download igtv post: {post.shortcode}: {e}") logger.error(f"Failed to download igtv post: {post.shortcode}: {e} {traceback.format_exc()}")
except Exception as e: except Exception as e:
logger.error(f"failed profile.get_igtv_posts: {e}") logger.error(f"Failed profile.get_igtv_posts: {e} {traceback.format_exc()}")
try: try:
for story in self.insta.get_stories([profile.userid]): for story in self.insta.get_stories([profile.userid]):
@@ -123,9 +124,9 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"failed to download story item: {item}: {e}") logger.error(f"Failed to download story item: {item}: {e} {traceback.format_exc()}")
except Exception as e: except Exception as e:
logger.error(f"failed get_stories: {e}") logger.error(f"Failed get_stories: {e} {traceback.format_exc()}")
try: try:
for highlight in self.insta.get_highlights(profile.userid): for highlight in self.insta.get_highlights(profile.userid):
@@ -133,9 +134,9 @@ class InstagramExtractor(Extractor):
try: try:
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
except Exception as e: except Exception as e:
logger.error(f"failed to download highlight item: {item}: {e}") logger.error(f"Failed to download highlight item: {item}: {e} {traceback.format_exc()}")
except Exception as e: except Exception as e:
logger.error(f"failed get_highlights: {e}") logger.error(f"Failed get_highlights: {e} {traceback.format_exc()}")
return self.process_downloads(url, f"@{username}", profile._asdict(), None) return self.process_downloads(url, f"@{username}", profile._asdict(), None)
@@ -158,4 +159,4 @@ class InstagramExtractor(Extractor):
return result.success("instagram") return result.success("instagram")
except Exception as e: except Exception as e:
logger.error(f"could not fetch instagram post due to: {e}") logger.error(f"Could not fetch instagram post due to: {e} {traceback.format_exc()}")

View File

@@ -32,7 +32,7 @@ class InstagramTbotExtractor(Extractor):
1. makes a copy of session_file that is removed in cleanup 1. makes a copy of session_file that is removed in cleanup
2. checks if the session file is valid 2. checks if the session file is valid
""" """
logger.info(f"SETUP {self.name} checking login...") logger.debug(f"SETUP {self.name} checking login...")
self._prepare_session_file() self._prepare_session_file()
self._initialize_telegram_client() self._initialize_telegram_client()
@@ -58,10 +58,10 @@ class InstagramTbotExtractor(Extractor):
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}" "If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
) )
with self.client.start(): with self.client.start():
logger.info(f"SETUP {self.name} login works.") logger.debug(f"SETUP {self.name} login works.")
def cleanup(self) -> None: def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.") logger.debug(f"CLEANUP {self.name}.")
session_file_name = self.session_file + ".session" session_file_name = self.session_file + ".session"
if os.path.exists(session_file_name): if os.path.exists(session_file_name):
os.remove(session_file_name) os.remove(session_file_name)
@@ -79,17 +79,17 @@ class InstagramTbotExtractor(Extractor):
# This may be outdated and replaced by the below message, but keeping until confirmed # This may be outdated and replaced by the below message, but keeping until confirmed
if "You must enter a URL to a post" in message: if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}") logger.debug(f"Invalid link for {self.name}: {message}")
return False return False
if "Media not found or unavailable" in message: if "Media not found or unavailable" in message:
logger.debug(f"No media found for link {url=} for {self.name}: {message}") logger.debug(f"No media found for {self.name}: {message}")
return False return False
if message: if message:
result.set_content(message).set_title(message[:128]) result.set_content(message).set_title(message[:128])
elif result.is_empty(): elif result.is_empty():
logger.debug(f"No media found for link {url=} for {self.name}: {message}") logger.debug(f"No media found for {self.name}: {message}")
return False return False
return result.success("insta-via-bot") return result.success("insta-via-bot")

View File

@@ -8,9 +8,7 @@ from auto_archiver.core import Media, Metadata
class JsonEnricher(Enricher): class JsonEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug("Enriching as JSON")
logger.debug(f"JSON Enricher for {url=}")
item_path = os.path.join(self.tmp_dir, "metadata.json") item_path = os.path.join(self.tmp_dir, "metadata.json")
with open(item_path, mode="w", encoding="utf-8") as outf: with open(item_path, mode="w", encoding="utf-8") as outf:

View File

@@ -38,8 +38,7 @@ class LocalStorage(Storage):
os.makedirs(os.path.dirname(dest), exist_ok=True) os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}") logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
res = shutil.copy2(media.filename, dest) shutil.copy2(media.filename, dest)
logger.info(res)
return True return True
# must be implemented even if unused # must be implemented even if unused

View File

@@ -12,20 +12,17 @@ class MetaEnricher(Enricher):
""" """
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
if to_enrich.is_empty(): if to_enrich.is_empty():
logger.debug(f"[SKIP] META_ENRICHER there is no media or metadata to enrich: {url=}") logger.debug("[SKIP] META_ENRICHER there is no media or metadata to enrich")
return return
logger.debug(f"calculating archive metadata information for {url=}") logger.debug("Calculating archive metadata information")
self.enrich_file_sizes(to_enrich) self.enrich_file_sizes(to_enrich)
self.enrich_archive_duration(to_enrich) self.enrich_archive_duration(to_enrich)
def enrich_file_sizes(self, to_enrich: Metadata): def enrich_file_sizes(self, to_enrich: Metadata):
logger.debug( logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)"
)
total_size = 0 total_size = 0
for media in to_enrich.get_all_media(): for media in to_enrich.get_all_media():
file_stats = os.stat(media.filename) file_stats = os.stat(media.filename)
@@ -44,7 +41,7 @@ class MetaEnricher(Enricher):
size /= 1024 size /= 1024
def enrich_archive_duration(self, to_enrich): def enrich_archive_duration(self, to_enrich):
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ") logger.debug("Calculating archive duration")
archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at") archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
to_enrich.set("archive_duration_seconds", archive_duration.seconds) to_enrich.set("archive_duration_seconds", archive_duration.seconds)

View File

@@ -12,8 +12,7 @@ class MetadataEnricher(Enricher):
""" """
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug("Extracting EXIF metadata")
logger.debug(f"extracting EXIF metadata for {url=}")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
if len(md := self.get_metadata(m.filename)): if len(md := self.get_metadata(m.filename)):
@@ -31,8 +30,8 @@ class MetadataEnricher(Enricher):
field, value = line.strip().split(":", 1) field, value = line.strip().split(":", 1)
metadata[field.strip()] = value.strip() metadata[field.strip()] = value.strip()
return metadata return metadata
except FileNotFoundError: except FileNotFoundError as e:
logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.") logger.error(f"ExifTool not found. Make sure ExifTool is installed and added to PATH. {e}")
except Exception as e: except Exception as e:
logger.error(f"Error occurred: {e}: {traceback.format_exc()}") logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
return {} return {}

View File

@@ -1,4 +1,5 @@
import os import os
import traceback
from auto_archiver.utils.custom_logger import logger from auto_archiver.utils.custom_logger import logger
import opentimestamps import opentimestamps
@@ -14,13 +15,12 @@ from auto_archiver.utils.misc import get_current_timestamp
class OpentimestampsEnricher(Enricher): class OpentimestampsEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug("OpenTimestamps timestamping files")
logger.debug(f"OpenTimestamps timestamping files for {url=}")
# Get the media files to timestamp # Get the media files to timestamp
media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")] media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")]
if not media_files: if not media_files:
logger.debug(f"No files found to timestamp in {url=}") logger.debug("No files found to timestamp")
return return
timestamp_files = [] timestamp_files = []
@@ -94,7 +94,7 @@ class OpentimestampsEnricher(Enricher):
detached_timestamp.serialize(ctx) detached_timestamp.serialize(ctx)
f.write(ctx.getbytes()) f.write(ctx.getbytes())
except Exception as e: except Exception as e:
logger.warning(f"Failed to serialize timestamp file: {e}") logger.warning(f"Failed to serialize timestamp file: {e} {traceback.format_exc()}")
continue continue
# Create media for the timestamp file # Create media for the timestamp file
@@ -113,16 +113,16 @@ class OpentimestampsEnricher(Enricher):
media.set("opentimestamps", True) media.set("opentimestamps", True)
except Exception as e: except Exception as e:
logger.warning(f"Error while timestamping {media.filename}: {e}") logger.warning(f"Error while timestamping {media.filename}: {e} {traceback.format_exc()}")
# Add timestamp files to the metadata # Add timestamp files to the metadata
if timestamp_files: if timestamp_files:
to_enrich.set("opentimestamped", True) to_enrich.set("opentimestamped", True)
to_enrich.set("opentimestamps_count", len(timestamp_files)) to_enrich.set("opentimestamps_count", len(timestamp_files))
logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}") logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created")
else: else:
to_enrich.set("opentimestamped", False) to_enrich.set("opentimestamped", False)
logger.warning(f"No successful timestamps created for {url=}") logger.warning("No successful timestamps created")
def verify_timestamp(self, detached_timestamp): def verify_timestamp(self, detached_timestamp):
""" """

View File

@@ -28,8 +28,7 @@ class PdqHashEnricher(Enricher):
""" """
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug("Calculating perceptual hashes")
logger.debug(f"calculating perceptual hashes for {url=}")
media_with_hashes = [] media_with_hashes = []
for m in to_enrich.media: for m in to_enrich.media:
@@ -44,7 +43,7 @@ class PdqHashEnricher(Enricher):
media.set("pdq_hash", hd) media.set("pdq_hash", hd)
media_with_hashes.append(media.filename) media_with_hashes.append(media.filename)
logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}") logger.debug(f"Calculated '{len(media_with_hashes)}' perceptual hashes: {media_with_hashes}")
def calculate_pdq_hash(self, filename): def calculate_pdq_hash(self, filename):
# returns a hexadecimal string with the perceptual hash for the given filename # returns a hexadecimal string with the perceptual hash for the given filename

View File

@@ -56,7 +56,7 @@ class S3Storage(Storage):
if existing_key := self.file_in_folder(path): if existing_key := self.file_in_folder(path):
media._key = existing_key media._key = existing_key
media.set("previously archived", True) media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") logger.debug(f"Skipping upload of {media.filename} because it already exists in {media.key}")
return False return False
_, ext = os.path.splitext(media.key) _, ext = os.path.splitext(media.key)

View File

@@ -19,10 +19,10 @@ class SSLEnricher(Enricher):
url = to_enrich.get_url() url = to_enrich.get_url()
parsed = urlparse(url) parsed = urlparse(url)
assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}" assert parsed.scheme in ["https"], "Invalid URL scheme"
domain = parsed.netloc domain = parsed.netloc
logger.debug(f"fetching SSL certificate for {domain=} in {url=}") logger.debug(f"Fetching SSL certificate for {domain=}")
cert = ssl.get_server_certificate((domain, 443)) cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem") cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")

View File

@@ -38,7 +38,7 @@ class TelegramExtractor(Extractor):
video = s.find("video") video = s.find("video")
if video is None: if video is None:
logger.warning("could not find video") logger.warning("Could not find video")
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap") image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
image_urls = [] image_urls = []

View File

@@ -65,7 +65,7 @@ class TelethonExtractor(Extractor):
# get currently joined channels # get currently joined channels
# https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog # https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog
joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel] joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel]
logger.info(f"already part of {len(joined_channel_ids)} channels") logger.info(f"Already part of {len(joined_channel_ids)} channels")
i = 0 i = 0
pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites)) pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites))
@@ -80,22 +80,22 @@ class TelethonExtractor(Extractor):
else: else:
ent = self.client.get_entity(invite) # fails if not a member ent = self.client.get_entity(invite) # fails if not a member
logger.warning( logger.warning(
f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting." f"Please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
) )
except ValueError: except ValueError:
logger.info(f"joining new channel {invite=}") logger.info(f"Joining new channel {invite=}")
try: try:
self.client(ImportChatInviteRequest(match.group(2))) self.client(ImportChatInviteRequest(match.group(2)))
except UserAlreadyParticipantError: except UserAlreadyParticipantError:
logger.info(f"already joined {invite=}") logger.info(f"Already joined {invite=}")
except InviteRequestSentError: except InviteRequestSentError:
logger.warning(f"already sent a join request with {invite} still no answer") logger.warning(f"Already sent a join request with {invite} still no answer")
except InviteHashExpiredError: except InviteHashExpiredError:
logger.warning(f"{invite=} has expired please find a more recent one") logger.warning(f"{invite=} has expired please find a more recent one")
except Exception as e: except Exception as e:
logger.error(f"could not join channel with {invite=} due to {e}") logger.error(f"Could not join channel with {invite=} due to {e}")
except FloodWaitError as e: except FloodWaitError as e:
logger.warning(f"got a flood error, need to wait {e.seconds} seconds") logger.warning(f"Got a flood error, need to wait {e.seconds} seconds")
time.sleep(e.seconds) time.sleep(e.seconds)
continue continue
else: else:
@@ -117,7 +117,7 @@ class TelethonExtractor(Extractor):
url = item.get_url() url = item.get_url()
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
match = self.valid_url.search(url) match = self.valid_url.search(url)
logger.debug(f"TELETHON: {match=}") logger.debug(f"Found telethon url {match=}")
if not match: if not match:
return False return False
@@ -135,10 +135,10 @@ class TelethonExtractor(Extractor):
try: try:
stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id])) stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id]))
if not stories.stories: if not stories.stories:
logger.info(f"No stories found for {url}, possibly it's private or the story has expired.") logger.info("No stories found, possibly it's private or the story has expired.")
return False return False
story = stories.stories[0] story = stories.stories[0]
logger.debug(f"TELETHON got story {story.id=} {story.date=} {story.expire_date=}") logger.debug(f"Got story {story.id=} {story.date=} {story.expire_date=}")
result.set_timestamp(story.date).set("views", story.views.to_dict()).set( result.set_timestamp(story.date).set("views", story.views.to_dict()).set(
"expire_date", story.expire_date "expire_date", story.expire_date
) )
@@ -154,20 +154,20 @@ class TelethonExtractor(Extractor):
try: try:
post = self.client.get_messages(chat, ids=post_id) post = self.client.get_messages(chat, ids=post_id)
except ValueError as e: except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") logger.error(f"Could not fetch telegram URL possibly it's private: {e}")
return False return False
except ChannelInvalidError as e: except ChannelInvalidError as e:
logger.error( logger.error(
f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}" f"Could not fetch telegram URL. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}"
) )
return False return False
logger.debug(f"TELETHON got post {post=}") logger.debug(f"Got post {post=}")
if post is None: if post is None:
return False return False
media_posts = self._get_media_posts_in_group(chat, post) media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f"got {len(media_posts)=} for {url=}") logger.debug(f"Got {len(media_posts)=}")
group_id = post.grouped_id if post.grouped_id is not None else post.id group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message title = post.message

View File

@@ -27,12 +27,12 @@ class ThumbnailEnricher(Enricher):
Calculates how many thumbnails to generate and at which timestamps based on the video duration, the number of thumbnails per minute and the max number of thumbnails. Calculates how many thumbnails to generate and at which timestamps based on the video duration, the number of thumbnails per minute and the max number of thumbnails.
Thumbnails are equally distributed across the video duration. Thumbnails are equally distributed across the video duration.
""" """
logger.debug(f"generating thumbnails for {to_enrich.get_url()}") logger.debug("Generating thumbnails")
for m_id, m in enumerate(to_enrich.media[::]): for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video(): if m.is_video():
folder = os.path.join(self.tmp_dir, random_str(24)) folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}") logger.debug(f"Generating thumbnails for {m.filename}")
duration = m.get("duration") duration = m.get("duration")
try: try:
@@ -42,10 +42,10 @@ class ThumbnailEnricher(Enricher):
) )
to_enrich.media[m_id].set("duration", duration) to_enrich.media[m_id].set("duration", duration)
except Exception as e: except Exception as e:
logger.warning(f"failed to get duration with FFMPEG from {m.filename}: {e}") logger.warning(f"Failed to get duration with FFMPEG from {m.filename}: {e}")
if not duration or type(duration) not in [float, int] or duration <= 0: if not duration or type(duration) not in [float, int] or duration <= 0:
logger.warning(f"cannot generate thumbnails for {m.filename} without valid duration") logger.warning(f"Cannot generate thumbnails for {m.filename} without valid duration")
continue continue
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails)) num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))

View File

@@ -49,8 +49,7 @@ class TimestampingEnricher(Enricher):
self.session.close() self.session.close()
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() logger.debug(f"RFC3161 timestamping existing files")
logger.debug(f"RFC3161 timestamping existing files for {url=}")
# create a new text file with the existing media hashes # create a new text file with the existing media hashes
hashes = [ hashes = [
@@ -58,7 +57,7 @@ class TimestampingEnricher(Enricher):
] ]
if not len(hashes): if not len(hashes):
logger.debug(f"No hashes found in {url=}") logger.debug(f"No hashes found")
return return
@@ -74,7 +73,7 @@ class TimestampingEnricher(Enricher):
try: try:
message = bytes(data_to_sign, encoding='utf8') message = bytes(data_to_sign, encoding='utf8')
logger.debug(f"Timestamping {url=} with {tsa_url=}") logger.debug(f"Timestamping with {tsa_url=}")
signed: TimeStampResponse = self.sign_data(tsa_url, message) signed: TimeStampResponse = self.sign_data(tsa_url, message)
# fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities` # fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
@@ -92,7 +91,7 @@ class TimestampingEnricher(Enricher):
timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url) timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url)
timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain)) timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain))
except Exception as e: except Exception as e:
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}") logger.warning(f"Error while timestamping with {tsa_url=}: {e}")
if len(timestamp_tokens): if len(timestamp_tokens):
hashes_media.set("timestamp_authority_files", timestamp_tokens) hashes_media.set("timestamp_authority_files", timestamp_tokens)
@@ -101,9 +100,9 @@ class TimestampingEnricher(Enricher):
hashes_media.set("cryptography v", version("cryptography")) hashes_media.set("cryptography v", version("cryptography"))
to_enrich.add_media(hashes_media, id="timestamped_hashes") to_enrich.add_media(hashes_media, id="timestamped_hashes")
to_enrich.set("timestamped", True) to_enrich.set("timestamped", True)
logger.info(f"{len(timestamp_tokens)} timestamp tokens created for {url=}") logger.info(f"{len(timestamp_tokens)} timestamp tokens created")
else: else:
logger.warning(f"No successful timestamps for {url=}") logger.warning(f"No successful timestamps found")
def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str: def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str:
""" """

View File

@@ -45,10 +45,9 @@ class TwitterApiExtractor(Extractor):
if "https://t.co/" in url: if "https://t.co/" in url:
try: try:
r = requests.get(url, timeout=30) r = requests.get(url, timeout=30)
logger.debug(f"Expanded url {url} to {r.url}")
url = r.url url = r.url
except Exception: except Exception as e:
logger.error(f"Failed to expand url {url}") logger.error(f"Failed to expand Twitter URL: {e}")
return url return url
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
@@ -67,7 +66,7 @@ class TwitterApiExtractor(Extractor):
return False, False return False, False
username, tweet_id = matches[0] # only one URL supported username, tweet_id = matches[0] # only one URL supported
logger.debug(f"Found {username=} and {tweet_id=} in {url=}") logger.debug(f"Found {username=} and {tweet_id=}")
return username, tweet_id return username, tweet_id
@@ -85,7 +84,7 @@ class TwitterApiExtractor(Extractor):
media_fields=["type", "duration_ms", "url", "variants"], media_fields=["type", "duration_ms", "url", "variants"],
tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
) )
logger.debug(tweet) logger.debug(f"Got {tweet=}")
except Exception as e: except Exception as e:
logger.error(f"Could not get tweet: {e}") logger.error(f"Could not get tweet: {e}")
return False return False

View File

@@ -94,7 +94,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
# call docker if explicitly enabled or we are running on the host (not in docker) # call docker if explicitly enabled or we are running on the host (not in docker)
if self.use_docker: if self.use_docker:
logger.debug(f"generating WACZ in Docker for {url=}") logger.debug("Generating WACZ in Docker")
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}") logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
if self.docker_commands: if self.docker_commands:
cmd = self.docker_commands + cmd cmd = self.docker_commands + cmd
@@ -111,12 +111,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
if self.profile: if self.profile:
profile_file = f"profile-{self.crawl_id}.tar.gz" profile_file = f"profile-{self.crawl_id}.tar.gz"
profile_fn = os.path.join(browsertrix_home_container, profile_file) profile_fn = os.path.join(browsertrix_home_container, profile_file)
logger.debug(f"copying {self.profile} to {profile_fn}") logger.debug(f"Copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn) shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", profile_file)]) cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
else: else:
logger.debug(f"generating WACZ without Docker for {url=}") logger.debug("Generating WACZ without Docker")
if self.profile: if self.profile:
cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) cmd.extend(["--profile", os.path.join("/app", str(self.profile))])

View File

@@ -31,15 +31,15 @@ class WaybackExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url): if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}") logger.debug("[SKIP] WAYBACK since url is behind AUTH WALL")
return return
logger.debug(f"calling wayback for {url=}")
if to_enrich.get("wayback"): if to_enrich.get("wayback"):
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}") logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
return True return True
logger.debug("Calling Wayback")
ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"} ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"}
post_data = {"url": url} post_data = {"url": url}
if self.if_not_archived_within: if self.if_not_archived_within:
@@ -68,7 +68,7 @@ class WaybackExtractorEnricher(Enricher, Extractor):
attempt = 1 attempt = 1
while not wayback_url and time.time() - start_time <= self.timeout: while not wayback_url and time.time() - start_time <= self.timeout:
try: try:
logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") logger.debug(f"GETting status for {job_id=} ({attempt=})")
r_status = requests.get( r_status = requests.get(
f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies
) )
@@ -79,13 +79,13 @@ class WaybackExtractorEnricher(Enricher, Extractor):
logger.error(f"Wayback failed with {r_json}") logger.error(f"Wayback failed with {r_json}")
return False return False
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
logger.warning(f"RequestException: fetching status for {url=} due to: {e}") logger.warning(f"RequestException: fetching status due to: {e}")
break break
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}") logger.error(f"Expected a JSON from Wayback and got {r.text}")
break break
except Exception as e: except Exception as e:
logger.warning(f"error fetching status for {url=} due to: {e}") logger.warning(f"error fetching status due to: {e}")
if not wayback_url: if not wayback_url:
attempt += 1 attempt += 1
time.sleep(1) # TODO: can be improved with exponential backoff time.sleep(1) # TODO: can be improved with exponential backoff

View File

@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") logger.debug(f"WHISPER[{self.action}]: iterating media items")
job_results = {} job_results = {}
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
@@ -35,7 +35,7 @@ class WhisperEnricher(Enricher):
try: try:
job_id = self.submit_job(m) job_id = self.submit_job(m)
job_results[job_id] = False job_results[job_id] = False
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}") logger.debug(f"Job submitted: {job_id=} for {m.key=}")
to_enrich.media[i].set("whisper_model", {"job_id": job_id}) to_enrich.media[i].set("whisper_model", {"job_id": job_id})
except Exception as e: except Exception as e:
logger.error( logger.error(
@@ -72,14 +72,14 @@ class WhisperEnricher(Enricher):
"type": self.action, "type": self.action,
# "language": "string" # may be a config # "language": "string" # may be a config
} }
logger.debug(f"calling API with {payload=}") logger.debug(f"Calling API with {payload=}")
response = requests.post( response = requests.post(
f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"} f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"}
) )
assert response.status_code == 201, ( assert response.status_code == 201, (
f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}" f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
) )
logger.debug(response.json()) logger.debug(f"Response from whisper API: {response.json()}")
return response.json()["id"] return response.json()["id"]
def check_jobs(self, job_results: dict): def check_jobs(self, job_results: dict):
@@ -115,7 +115,7 @@ class WhisperEnricher(Enricher):
assert r_res.status_code == 200, ( assert r_res.status_code == 200, (
f"Job artifacts did not respond with 200, instead with: {r_res.status_code}" f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
) )
logger.success(r_res.json()) logger.info(f"Job {job_id} completed successfully:{r_res.json()}")
result = {} result = {}
for art_id, artifact in enumerate(r_res.json()): for art_id, artifact in enumerate(r_res.json()):
subtitle = [] subtitle = []

View File

@@ -6,7 +6,6 @@ import uuid
from datetime import datetime, timezone from datetime import datetime, timezone
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
import requests
from auto_archiver.utils.custom_logger import logger from auto_archiver.utils.custom_logger import logger
@@ -15,18 +14,6 @@ def mkdir_if_not_exists(folder):
os.makedirs(folder) os.makedirs(folder)
def expand_url(url):
# expand short URL links
if "https://t.co/" in url:
try:
r = requests.get(url)
logger.debug(f"Expanded url {url} to {r.url}")
return r.url
except Exception:
logger.error(f"Failed to expand url {url}")
return url
def getattr_or(o: object, prop: str, default=None): def getattr_or(o: object, prop: str, default=None):
try: try:
res = getattr(o, prop) res = getattr(o, prop)

View File

@@ -29,7 +29,7 @@ def test_fetch_fail_status(api_db, metadata, mocker):
mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get") mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get")
mock_get.return_value.status_code = 400 mock_get.return_value.status_code = 400
mock_get.return_value.json.return_value = {} mock_get.return_value.json.return_value = {}
mock_error = mocker.patch("loguru.logger.error") mock_error = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
assert api_db.fetch(metadata) is False assert api_db.fetch(metadata) is False
mock_error.assert_called_once_with("AA API FAIL (400): {}") mock_error.assert_called_once_with("AA API FAIL (400): {}")

View File

@@ -33,7 +33,6 @@ def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata):
"""Test that enrich() does nothing when Metadata is empty.""" """Test that enrich() does nothing when Metadata is empty."""
mock_metadata.is_empty.return_value = True mock_metadata.is_empty.return_value = True
meta_enricher.enrich(mock_metadata) meta_enricher.enrich(mock_metadata)
mock_metadata.get_url.assert_called_once()
def test_enrich_file_sizes(meta_enricher, metadata, tmp_path): def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):

View File

@@ -65,7 +65,7 @@ def test_enrich_empty_media(enricher, mocker):
def test_get_metadata_error_handling(enricher, mocker): def test_get_metadata_error_handling(enricher, mocker):
mocker.patch("subprocess.run", side_effect=Exception("Test error")) mocker.patch("subprocess.run", side_effect=Exception("Test error"))
mock_log = mocker.patch("loguru.logger.error") mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
result = enricher.get_metadata("test.jpg") result = enricher.get_metadata("test.jpg")
assert result == {} assert result == {}
assert "Error occurred: " in mock_log.call_args[0][0] assert "Error occurred: " in mock_log.call_args[0][0]

View File

@@ -43,7 +43,7 @@ def test_enrich_skip_non_image(metadata_with_images, mocker):
def test_enrich_handles_corrupted_image(metadata_with_images, mocker): def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")) mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image"))
mock_pdq = mocker.patch("pdqhash.compute") mock_pdq = mocker.patch("pdqhash.compute")
mock_logger = mocker.patch("loguru.logger.error") mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
enricher = PdqHashEnricher() enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images) enricher.enrich(metadata_with_images)

View File

@@ -75,12 +75,12 @@ def test_enrich_thumbnail_limits(
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker): def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error")) mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
mocker.patch("os.makedirs") mocker.patch("os.makedirs")
mock_logger = mocker.patch("loguru.logger.warning") mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning")
mocker.patch.object(Media, "is_video", return_value=True) mocker.patch.object(Media, "is_video", return_value=True)
thumbnail_enricher.enrich(metadata_with_video) thumbnail_enricher.enrich(metadata_with_video)
# Ensure error was logged # Ensure error was logged
mock_logger.assert_called_with("cannot generate thumbnails for video.mp4 without valid duration") mock_logger.assert_called_with("Cannot generate thumbnails for video.mp4 without valid duration")
# Ensure no thumbnails were created # Ensure no thumbnails were created
thumbnails = metadata_with_video.media[0].get("thumbnails") thumbnails = metadata_with_video.media[0].get("thumbnails")
assert thumbnails is None assert thumbnails is None
@@ -128,12 +128,12 @@ def test_enrich_handles_short_video(
def test_uses_existing_duration_on_exception(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker): def test_uses_existing_duration_on_exception(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
mock_logger = mocker.patch("loguru.logger.warning") mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning")
mock_probe = mocker.patch("ffmpeg.probe", side_effect=Exception("New probe error")) mock_probe = mocker.patch("ffmpeg.probe", side_effect=Exception("New probe error"))
metadata_with_video.media[0].set("duration", 3) metadata_with_video.media[0].set("duration", 3)
thumbnail_enricher.enrich(metadata_with_video) thumbnail_enricher.enrich(metadata_with_video)
mock_probe.assert_called_once() mock_probe.assert_called_once()
mock_logger.assert_called_with("failed to get duration with FFMPEG from video.mp4: New probe error") mock_logger.assert_called_with("Failed to get duration with FFMPEG from video.mp4: New probe error")
assert mock_ffmpeg_environment["mock_output"].run.call_count == 3 assert mock_ffmpeg_environment["mock_output"].run.call_count == 3

View File

@@ -46,7 +46,7 @@ def test_setup_with_docker(wacz_enricher, mocker):
def test_already_ran(wacz_enricher, metadata, mocker): def test_already_ran(wacz_enricher, metadata, mocker):
metadata.add_media(Media("test.wacz"), id="browsertrix") metadata.add_media(Media("test.wacz"), id="browsertrix")
mock_log = mocker.patch("loguru.logger.info") mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info")
assert wacz_enricher.enrich(metadata) is True assert wacz_enricher.enrich(metadata) is True
assert "WACZ enricher had already been executed" in mock_log.call_args[0][0] assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
@@ -73,7 +73,7 @@ def test_download_success(wacz_enricher, mocker) -> None:
def test_enrich_already_executed(wacz_enricher, mocker) -> None: def test_enrich_already_executed(wacz_enricher, mocker) -> None:
"""Test enrich if already executed.""" """Test enrich if already executed."""
mock_log = mocker.patch("loguru.logger.info") mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info")
metadata = Metadata().set_url("https://example.com") metadata = Metadata().set_url("https://example.com")
media = Media(filename="some_file.wacz") media = Media(filename="some_file.wacz")
metadata.add_media(media, id="browsertrix") metadata.add_media(media, id="browsertrix")

View File

@@ -118,8 +118,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
orchestrator.setup_config(test_args) orchestrator.setup_config(test_args)
assert "the following arguments are required: --example_module.required_field" in caplog.records[0].message
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
def test_get_required_values_from_config(orchestrator, test_args, tmp_path): def test_get_required_values_from_config(orchestrator, test_args, tmp_path):

View File

@@ -6,7 +6,6 @@ import pytest
from auto_archiver.utils.misc import ( from auto_archiver.utils.misc import (
mkdir_if_not_exists, mkdir_if_not_exists,
expand_url,
getattr_or, getattr_or,
DateTimeEncoder, DateTimeEncoder,
dump_payload, dump_payload,
@@ -39,26 +38,6 @@ class TestDirectoryUtils:
assert existing_dir.exists() assert existing_dir.exists()
class TestURLExpansion:
@pytest.mark.parametrize(
"input_url,expected",
[("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")],
)
def test_expand_url(self, input_url, expected, mocker):
mock_response = mocker.Mock()
mock_response.url = "https://expanded.url"
mocker.patch("requests.get", return_value=mock_response)
result = expand_url(input_url)
assert result == expected
def test_expand_url_handles_errors(self, caplog, mocker):
mocker.patch("requests.get", side_effect=Exception("Connection error"))
url = "https://t.co/error"
result = expand_url(url)
assert result == url
assert f"Failed to expand url {url}" in caplog.text
class TestAttributeHandling: class TestAttributeHandling:
class Sample: class Sample:
exists = "value" exists = "value"