mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
concludes logging standardization refactor
This commit is contained in:
@@ -24,4 +24,4 @@ SESSION_FILE = "secrets/anon-insta"
|
|||||||
|
|
||||||
os.makedirs("secrets", exist_ok=True)
|
os.makedirs("secrets", exist_ok=True)
|
||||||
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
||||||
logger.success(f"New session file created: {SESSION_FILE}.session")
|
logger.success(f"new session file created: {SESSION_FILE}.session")
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
|
|||||||
to_filename = to_filename[-64:]
|
to_filename = to_filename[-64:]
|
||||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||||
if verbose:
|
if verbose:
|
||||||
logger.debug(f"downloading {to_filename=}")
|
logger.debug(f"Downloading {to_filename=}")
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ class Media:
|
|||||||
@property # getter .mimetype
|
@property # getter .mimetype
|
||||||
def mimetype(self) -> str:
|
def mimetype(self) -> str:
|
||||||
if not self.filename or len(self.filename) == 0:
|
if not self.filename or len(self.filename) == 0:
|
||||||
logger.warning(f"cannot get mimetype from media without filename: {self}")
|
logger.warning(f"Cannot get mimetype from media without filename: {self}")
|
||||||
return ""
|
return ""
|
||||||
if not self._mimetype:
|
if not self._mimetype:
|
||||||
self._mimetype = mimetypes.guess_type(self.filename)[0]
|
self._mimetype = mimetypes.guess_type(self.filename)[0]
|
||||||
@@ -116,7 +116,7 @@ class Media:
|
|||||||
# self.is_video() should be used together with this method
|
# self.is_video() should be used together with this method
|
||||||
try:
|
try:
|
||||||
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
|
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
|
||||||
logger.debug(f"STREAMS FOR {self.filename} {streams}")
|
logger.debug(f"Streams for {self.filename}: {streams}")
|
||||||
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
||||||
except Error:
|
except Error:
|
||||||
return False # ffmpeg errors when reading bad files
|
return False # ffmpeg errors when reading bad files
|
||||||
|
|||||||
@@ -539,11 +539,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
for feeder in self.feeders:
|
for feeder in self.feeders:
|
||||||
for item in feeder:
|
for item in feeder:
|
||||||
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
|
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
|
||||||
logger.info("started processing")
|
logger.info("Started processing")
|
||||||
yield self.feed_item(item)
|
yield self.feed_item(item)
|
||||||
url_count += 1
|
url_count += 1
|
||||||
|
|
||||||
logger.info(f"processed {url_count} URL(s)")
|
logger.info(f"Processed {url_count} URL(s)")
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
|
|
||||||
def feed_item(self, item: Metadata) -> Metadata:
|
def feed_item(self, item: Metadata) -> Metadata:
|
||||||
@@ -561,7 +561,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
return self.archive(item)
|
return self.archive(item)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# catches keyboard interruptions to do a clean exit
|
# catches keyboard interruptions to do a clean exit
|
||||||
logger.warning("caught interrupt")
|
logger.warning("Caught interrupt")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.aborted(item)
|
d.aborted(item)
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
@@ -620,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
try:
|
try:
|
||||||
d.done(cached_result, cached=True)
|
d.done(cached_result, cached=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
return cached_result
|
return cached_result
|
||||||
|
|
||||||
# 3 - call extractors until one succeeds
|
# 3 - call extractors until one succeeds
|
||||||
for a in self.extractors:
|
for a in self.extractors:
|
||||||
logger.info(f"trying extractor {a.name}")
|
logger.info(f"Trying extractor {a.name}")
|
||||||
try:
|
try:
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success():
|
if result.is_success():
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"Extractor {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 4 - call enrichers to work with archived content
|
# 4 - call enrichers to work with archived content
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try:
|
try:
|
||||||
e.enrich(result)
|
e.enrich(result)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
|
logger.error(f"Enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 5 - store all downloaded/generated media
|
# 5 - store all downloaded/generated media
|
||||||
result.store(storages=self.storages)
|
result.store(storages=self.storages)
|
||||||
@@ -657,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
try:
|
try:
|
||||||
d.done(result)
|
d.done(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"Database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
continue # Skip imported modules/classes/functions
|
continue # Skip imported modules/classes/functions
|
||||||
if isinstance(obj, type) and issubclass(obj, Dropin):
|
if isinstance(obj, type) and issubclass(obj, Dropin):
|
||||||
dropins.append(obj)
|
dropins.append(obj)
|
||||||
logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
logger.debug(f"Loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
||||||
return dropins
|
return dropins
|
||||||
|
|
||||||
def sanitize_url(self, url: str) -> str:
|
def sanitize_url(self, url: str) -> str:
|
||||||
@@ -86,10 +86,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
||||||
logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
|
logger.info(f"Selenium browser is up with agent {self.agent}, opening url...")
|
||||||
sb.uc_open_with_reconnect(url, 4)
|
sb.uc_open_with_reconnect(url, 4)
|
||||||
|
|
||||||
logger.debug("handling CAPTCHAs for...")
|
logger.debug("Handling CAPTCHAs for...")
|
||||||
sb.uc_gui_handle_cf()
|
sb.uc_gui_handle_cf()
|
||||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||||
|
|
||||||
@@ -97,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
dropin.open_page(url)
|
dropin.open_page(url)
|
||||||
|
|
||||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||||
logger.warning("skipping since auth wall or CAPTCHA was detected")
|
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
sb.wait_for_ready_state_complete()
|
sb.wait_for_ready_state_complete()
|
||||||
@@ -124,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
js_css_selector=dropin.js_for_video_css_selectors(),
|
js_css_selector=dropin.js_for_video_css_selectors(),
|
||||||
max_media=self.max_download_videos - downloaded_videos,
|
max_media=self.max_download_videos - downloaded_videos,
|
||||||
)
|
)
|
||||||
logger.info("completed")
|
logger.info("Completed")
|
||||||
|
|
||||||
return to_enrich
|
return to_enrich
|
||||||
except selenium.common.exceptions.SessionNotCreatedException as e:
|
except selenium.common.exceptions.SessionNotCreatedException as e:
|
||||||
if custom_data_dir: # the retry logic only works once
|
if custom_data_dir: # the retry logic only works once
|
||||||
logger.error(
|
logger.error(
|
||||||
f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
f"Session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
||||||
)
|
)
|
||||||
return self.enrich(to_enrich, custom_data_dir=False)
|
return self.enrich(to_enrich, custom_data_dir=False)
|
||||||
raise e # re-raise
|
raise e # re-raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"runtime error: {e}: {traceback.format_exc()}")
|
logger.error(f"Runtime error: {e}: {traceback.format_exc()}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _get_suitable_dropin(self, url: str, sb: SB):
|
def _get_suitable_dropin(self, url: str, sb: SB):
|
||||||
@@ -145,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
"""
|
"""
|
||||||
for dropin in self.dropins:
|
for dropin in self.dropins:
|
||||||
if dropin.suitable(url):
|
if dropin.suitable(url):
|
||||||
logger.debug(f"using drop-in {dropin.__name__}")
|
logger.debug(f"Using drop-in {dropin.__name__}")
|
||||||
return dropin(sb, self)
|
return dropin(sb, self)
|
||||||
|
|
||||||
return DefaultDropin(sb, self)
|
return DefaultDropin(sb, self)
|
||||||
@@ -240,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
|
|
||||||
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
||||||
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
||||||
logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
|
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
|
||||||
sb.set_window_size(x, y)
|
sb.set_window_size(x, y)
|
||||||
|
|
||||||
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
||||||
@@ -279,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
# js_for_css_selectors
|
# js_for_css_selectors
|
||||||
for src in sources:
|
for src in sources:
|
||||||
if len(all_urls) >= max_media:
|
if len(all_urls) >= max_media:
|
||||||
logger.debug(f"reached max download limit of {max_media} images/videos.")
|
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
||||||
break
|
break
|
||||||
if not is_relevant_url(src):
|
if not is_relevant_url(src):
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ class Dropin:
|
|||||||
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
||||||
for url in video_urls:
|
for url in video_urls:
|
||||||
try:
|
try:
|
||||||
logger.debug("downloading video from url")
|
logger.debug(f"Downloading video from url: {url}")
|
||||||
info = ydl.extract_info(url, download=True)
|
info = ydl.extract_info(url, download=True)
|
||||||
filename = ydl_entry_to_filename(ydl, info)
|
filename = ydl_entry_to_filename(ydl, info)
|
||||||
if not filename: # Failed to download video.
|
if not filename: # Failed to download video.
|
||||||
@@ -156,5 +156,5 @@ class Dropin:
|
|||||||
to_enrich.add_media(media)
|
to_enrich.add_media(media)
|
||||||
downloaded += 1
|
downloaded += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"download failed: {e} {traceback.format_exc()}")
|
logger.error(f"Download failed: {e} {traceback.format_exc()}")
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
|
|||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
|
|
||||||
username, password = self._get_username_password("linkedin.com")
|
username, password = self._get_username_password("linkedin.com")
|
||||||
logger.debug("logging in to Linkedin with username: {}", username)
|
logger.debug("Logging in to Linkedin with username: {}", username)
|
||||||
self.sb.type("#username", username)
|
self.sb.type("#username", username)
|
||||||
self.sb.type("#password", password)
|
self.sb.type("#password", password)
|
||||||
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
|
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
|
|||||||
self._close_cookies_banner()
|
self._close_cookies_banner()
|
||||||
|
|
||||||
username, password = self._get_username_password("reddit.com")
|
username, password = self._get_username_password("reddit.com")
|
||||||
logger.debug("logging in to Reddit with username: {}", username)
|
logger.debug("Logging in to Reddit with username: {}", username)
|
||||||
|
|
||||||
self.sb.type("#login-username", username)
|
self.sb.type("#login-username", username)
|
||||||
self.sb.type("#login-password", password)
|
self.sb.type("#login-password", password)
|
||||||
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
|
|||||||
self.sb.click_link_text("Log in")
|
self.sb.click_link_text("Log in")
|
||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
if self.sb.is_text_visible("Welcome back"):
|
if self.sb.is_text_visible("Welcome back"):
|
||||||
logger.debug("login successful")
|
logger.debug("Login successful")
|
||||||
self.sb.click_if_visible("this link")
|
self.sb.click_if_visible("this link")
|
||||||
|
|
||||||
def _close_cookies_banner(self):
|
def _close_cookies_banner(self):
|
||||||
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
|
|||||||
.map(el => el.src || el.href)
|
.map(el => el.src || el.href)
|
||||||
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
||||||
""")
|
""")
|
||||||
logger.debug("found {} video URLs", len(filtered_urls))
|
logger.debug("Found {} video URLs", len(filtered_urls))
|
||||||
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
||||||
|
|||||||
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
|
|||||||
self.sb.open("https://vk.com")
|
self.sb.open("https://vk.com")
|
||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
if "/feed" in self.sb.get_current_url():
|
if "/feed" in self.sb.get_current_url():
|
||||||
logger.debug("already logged in to VK.")
|
logger.debug("Already logged in to VK.")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# need to login
|
# need to login
|
||||||
username, password = self._get_username_password("vk.com")
|
username, password = self._get_username_password("vk.com")
|
||||||
logger.debug("logging in to VK with username: {}", username)
|
logger.debug("Logging in to VK with username: {}", username)
|
||||||
|
|
||||||
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
||||||
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
||||||
|
|||||||
@@ -36,9 +36,9 @@ class AAApiDb(Database):
|
|||||||
if not self.store_results:
|
if not self.store_results:
|
||||||
return
|
return
|
||||||
if cached:
|
if cached:
|
||||||
logger.debug("skipping saving archive to AA API because it was cached")
|
logger.debug("Skipping saving archive to AA API because it was cached")
|
||||||
return
|
return
|
||||||
logger.debug("saving archive to the AA API.")
|
logger.debug("Saving archive to the AA API.")
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"author_id": self.author_id,
|
"author_id": self.author_id,
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||||
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
||||||
)
|
)
|
||||||
logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
|
logger.info(f"Stored failure ID {atlos_id} on Atlos: {reason}")
|
||||||
|
|
||||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||||
"""check and fetch if the given item has been archived already, each
|
"""check and fetch if the given item has been archived already, each
|
||||||
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
"""Mark an item as successfully archived in Atlos."""
|
"""Mark an item as successfully archived in Atlos."""
|
||||||
atlos_id = item.metadata.get("atlos_id")
|
atlos_id = item.metadata.get("atlos_id")
|
||||||
if not atlos_id:
|
if not atlos_id:
|
||||||
logger.info("item has no Atlos ID, skipping")
|
logger.info("Item has no Atlos ID, skipping")
|
||||||
return
|
return
|
||||||
self._post(
|
self._post(
|
||||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||||
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
logger.info(f"stored success ID {atlos_id} on Atlos")
|
logger.info(f"Stored success ID {atlos_id} on Atlos")
|
||||||
|
|
||||||
# ! Atlos Module - Storage Methods
|
# ! Atlos Module - Storage Methods
|
||||||
|
|
||||||
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
||||||
"""Upload a media file to Atlos if it has not been uploaded already."""
|
"""Upload a media file to Atlos if it has not been uploaded already."""
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
logger.error(f"no metadata provided for {media.filename}")
|
logger.error(f"No metadata provided for {media.filename}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
atlos_id = metadata.get("atlos_id")
|
atlos_id = metadata.get("atlos_id")
|
||||||
if not atlos_id:
|
if not atlos_id:
|
||||||
logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
||||||
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
params={"title": media.properties},
|
params={"title": media.properties},
|
||||||
files={"file": (os.path.basename(media.filename), file_obj)},
|
files={"file": (os.path.basename(media.filename), file_obj)},
|
||||||
)
|
)
|
||||||
logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||||
|
|||||||
@@ -20,19 +20,19 @@ class CSVFeeder(Feeder):
|
|||||||
url_column = first_row.index(url_column)
|
url_column = first_row.index(url_column)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
elif not (url_or_none(first_row[url_column])):
|
elif not (url_or_none(first_row[url_column])):
|
||||||
# it's a header row, but we've been given a column number already
|
# it's a header row, but we've been given a column number already
|
||||||
logger.debug(f"skipping header row: {first_row}")
|
logger.debug(f"Skipping header row: {first_row}")
|
||||||
else:
|
else:
|
||||||
# first row isn't a header row, rewind the file
|
# first row isn't a header row, rewind the file
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
|
|
||||||
for row in reader:
|
for row in reader:
|
||||||
if not url_or_none(row[url_column]):
|
if not url_or_none(row[url_column]):
|
||||||
logger.warning(f"not a valid URL in row: {row}, skipping")
|
logger.warning(f"Not a valid URL in row: {row}, skipping")
|
||||||
continue
|
continue
|
||||||
url = row[url_column]
|
url = row[url_column]
|
||||||
yield Metadata().set_url(url)
|
yield Metadata().set_url(url)
|
||||||
|
|||||||
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
|
|||||||
def _setup_google_drive_service(self):
|
def _setup_google_drive_service(self):
|
||||||
"""Initialize Google Drive service based on provided credentials."""
|
"""Initialize Google Drive service based on provided credentials."""
|
||||||
if self.oauth_token:
|
if self.oauth_token:
|
||||||
logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
|
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
|
||||||
self.service = self._initialize_with_oauth_token()
|
self.service = self._initialize_with_oauth_token()
|
||||||
elif self.service_account:
|
elif self.service_account:
|
||||||
logger.debug(f"using Google Drive service account: {self.service_account}")
|
logger.debug(f"Using Google Drive service account: {self.service_account}")
|
||||||
self.service = self._initialize_with_service_account()
|
self.service = self._initialize_with_service_account()
|
||||||
else:
|
else:
|
||||||
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
||||||
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
|
|||||||
if not creds.valid and creds.expired and creds.refresh_token:
|
if not creds.valid and creds.expired and creds.refresh_token:
|
||||||
creds.refresh(Request())
|
creds.refresh(Request())
|
||||||
with open(self.oauth_token, "w") as token_file:
|
with open(self.oauth_token, "w") as token_file:
|
||||||
logger.debug("saving refreshed OAuth token.")
|
logger.debug("Saving refreshed OAuth token.")
|
||||||
token_file.write(creds.to_json())
|
token_file.write(creds.to_json())
|
||||||
elif not creds.valid:
|
elif not creds.valid:
|
||||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||||
@@ -62,7 +62,7 @@ class GDriveStorage(Storage):
|
|||||||
parent_id, folder_id = self.root_folder_id, None
|
parent_id, folder_id = self.root_folder_id, None
|
||||||
path_parts = media.key.split(os.path.sep)
|
path_parts = media.key.split(os.path.sep)
|
||||||
filename = path_parts[-1]
|
filename = path_parts[-1]
|
||||||
logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
|
logger.info(f"Looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
|
||||||
for folder in path_parts[0:-1]:
|
for folder in path_parts[0:-1]:
|
||||||
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
|
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
|
||||||
parent_id = folder_id
|
parent_id = folder_id
|
||||||
@@ -70,7 +70,7 @@ class GDriveStorage(Storage):
|
|||||||
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
|
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
|
||||||
if not file_id:
|
if not file_id:
|
||||||
#
|
#
|
||||||
logger.info(f"file {filename} not found in folder {folder_id}")
|
logger.info(f"File {filename} not found in folder {folder_id}")
|
||||||
return None
|
return None
|
||||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||||
|
|
||||||
@@ -83,7 +83,7 @@ class GDriveStorage(Storage):
|
|||||||
parent_id, upload_to = self.root_folder_id, None
|
parent_id, upload_to = self.root_folder_id, None
|
||||||
path_parts = media.key.split(os.path.sep)
|
path_parts = media.key.split(os.path.sep)
|
||||||
filename = path_parts[-1]
|
filename = path_parts[-1]
|
||||||
logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
|
logger.info(f"Checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
|
||||||
for folder in path_parts[0:-1]:
|
for folder in path_parts[0:-1]:
|
||||||
upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
|
upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
|
||||||
if upload_to is None:
|
if upload_to is None:
|
||||||
@@ -91,7 +91,7 @@ class GDriveStorage(Storage):
|
|||||||
parent_id = upload_to
|
parent_id = upload_to
|
||||||
|
|
||||||
# upload file to gd
|
# upload file to gd
|
||||||
logger.debug(f"uploading {filename=} to folder id {upload_to}")
|
logger.debug(f"Uploading {filename=} to folder id {upload_to}")
|
||||||
file_metadata = {"name": [filename], "parents": [upload_to]}
|
file_metadata = {"name": [filename], "parents": [upload_to]}
|
||||||
try:
|
try:
|
||||||
media = MediaFileUpload(media.filename, resumable=True)
|
media = MediaFileUpload(media.filename, resumable=True)
|
||||||
@@ -100,11 +100,11 @@ class GDriveStorage(Storage):
|
|||||||
.create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id")
|
.create(supportsAllDrives=True, body=file_metadata, media_body=media, fields="id")
|
||||||
.execute()
|
.execute()
|
||||||
)
|
)
|
||||||
logger.debug(f"uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}")
|
logger.debug(f"Uploadf: uploaded file {gd_file['id']} successfully in folder={upload_to}")
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError as e:
|
||||||
logger.error(f"gd uploadf: file not found {media.filename=} - {e}")
|
logger.error(f"GD uploadf: file not found {media.filename=} - {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"gd uploadf: error uploading {media.filename=} to {upload_to} - {e}")
|
logger.error(f"GD uploadf: error uploading {media.filename=} to {upload_to} - {e}")
|
||||||
|
|
||||||
# must be implemented even if unused
|
# must be implemented even if unused
|
||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||||
@@ -133,7 +133,7 @@ class GDriveStorage(Storage):
|
|||||||
self.api_cache = getattr(self, "api_cache", {})
|
self.api_cache = getattr(self, "api_cache", {})
|
||||||
cache_key = f"{parent_id}_{name}_{use_mime_type}"
|
cache_key = f"{parent_id}_{name}_{use_mime_type}"
|
||||||
if cache_key in self.api_cache:
|
if cache_key in self.api_cache:
|
||||||
logger.debug(f"cache hit for {cache_key=}")
|
logger.debug(f"Cache hit for {cache_key=}")
|
||||||
return self.api_cache[cache_key]
|
return self.api_cache[cache_key]
|
||||||
|
|
||||||
# API logic
|
# API logic
|
||||||
@@ -168,7 +168,7 @@ class GDriveStorage(Storage):
|
|||||||
else:
|
else:
|
||||||
logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.")
|
logger.debug(f"{debug_header} not found, attempt {attempt + 1}/{retries}.")
|
||||||
if attempt < retries - 1:
|
if attempt < retries - 1:
|
||||||
logger.debug(f"sleeping for {sleep_seconds} second(s)")
|
logger.debug(f"Sleeping for {sleep_seconds} second(s)")
|
||||||
time.sleep(sleep_seconds)
|
time.sleep(sleep_seconds)
|
||||||
|
|
||||||
if raise_on_missing:
|
if raise_on_missing:
|
||||||
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
|
|||||||
Creates a new GDrive folder @name inside folder @parent_id
|
Creates a new GDrive folder @name inside folder @parent_id
|
||||||
Returns id of the created folder
|
Returns id of the created folder
|
||||||
"""
|
"""
|
||||||
logger.debug(f"creating new folder with {name=} inside {parent_id=}")
|
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
|
||||||
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
||||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
||||||
return gd_folder.get("id")
|
return gd_folder.get("id")
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
|
|||||||
# download if embeds present (1 video XOR >=1 images)
|
# download if embeds present (1 video XOR >=1 images)
|
||||||
for media in self._download_bsky_embeds(post, archiver):
|
for media in self._download_bsky_embeds(post, archiver):
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
logger.debug(f"downloaded {len(result.media)} media files")
|
logger.debug(f"Downloaded {len(result.media)} media files")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class GenericExtractor(Extractor):
|
|||||||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||||||
|
|
||||||
def update_package(self, package_name: str) -> bool:
|
def update_package(self, package_name: str) -> bool:
|
||||||
logger.info(f"checking and updating {package_name}...")
|
logger.info(f"Checking and updating {package_name}...")
|
||||||
from importlib.metadata import version as get_version
|
from importlib.metadata import version as get_version
|
||||||
|
|
||||||
old_version = get_version(package_name)
|
old_version = get_version(package_name)
|
||||||
@@ -79,7 +79,7 @@ class GenericExtractor(Extractor):
|
|||||||
return True
|
return True
|
||||||
logger.info(f"{package_name} already up to date")
|
logger.info(f"{package_name} already up to date")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to update {package_name}: {e}")
|
logger.error(f"Failed to update {package_name}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def setup_po_tokens(self) -> None:
|
def setup_po_tokens(self) -> None:
|
||||||
@@ -110,7 +110,7 @@ class GenericExtractor(Extractor):
|
|||||||
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
||||||
if missing_tools:
|
if missing_tools:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
||||||
"Install these tools or run bgutils via Docker. "
|
"Install these tools or run bgutils via Docker. "
|
||||||
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
||||||
)
|
)
|
||||||
@@ -139,7 +139,7 @@ class GenericExtractor(Extractor):
|
|||||||
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
||||||
)
|
)
|
||||||
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
||||||
logger.info(f"downloading bgutils release zip for version {plugin_version}...")
|
logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
|
||||||
urlretrieve(zip_url, zip_path)
|
urlretrieve(zip_url, zip_path)
|
||||||
with zipfile.ZipFile(zip_path, "r") as z:
|
with zipfile.ZipFile(zip_path, "r") as z:
|
||||||
z.extractall(base_dir)
|
z.extractall(base_dir)
|
||||||
@@ -148,7 +148,7 @@ class GenericExtractor(Extractor):
|
|||||||
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
||||||
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
||||||
shutil.rmtree(extracted_root)
|
shutil.rmtree(extracted_root)
|
||||||
logger.info("installing dependencies and transpiling PoT Generator script...")
|
logger.info("Installing dependencies and transpiling PoT Generator script...")
|
||||||
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
||||||
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
||||||
|
|
||||||
@@ -164,7 +164,7 @@ class GenericExtractor(Extractor):
|
|||||||
logger.info(f"PO Token script configured at: {script_path}")
|
logger.info(f"PO Token script configured at: {script_path}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to set up PO Token script: {e}")
|
logger.error(f"Failed to set up PO Token script: {e}")
|
||||||
|
|
||||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||||
"""
|
"""
|
||||||
@@ -205,7 +205,7 @@ class GenericExtractor(Extractor):
|
|||||||
media = Media(cover_image_path)
|
media = Media(cover_image_path)
|
||||||
metadata.add_media(media, id="cover")
|
metadata.add_media(media, id="cover")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"could not download cover image {thumbnail_url}: {e}")
|
logger.error(f"Could not download cover image {thumbnail_url}: {e}")
|
||||||
|
|
||||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||||
if dropin:
|
if dropin:
|
||||||
@@ -352,7 +352,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
if not dropin:
|
if not dropin:
|
||||||
# TODO: add a proper link to 'how to create your own dropin'
|
# TODO: add a proper link to 'how to create your own dropin'
|
||||||
logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
|
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
||||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -374,7 +374,7 @@ class GenericExtractor(Extractor):
|
|||||||
if "entries" in data:
|
if "entries" in data:
|
||||||
entries = data.get("entries", [])
|
entries = data.get("entries", [])
|
||||||
if not len(entries):
|
if not len(entries):
|
||||||
logger.info("YoutubeDLArchiver could not find any video")
|
logger.info("GenericExtractor could not find any video")
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
entries = [data]
|
entries = [data]
|
||||||
@@ -388,7 +388,7 @@ class GenericExtractor(Extractor):
|
|||||||
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
|
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||||
|
|
||||||
new_media = Media(filename)
|
new_media = Media(filename)
|
||||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||||
@@ -403,12 +403,12 @@ class GenericExtractor(Extractor):
|
|||||||
text = " ".join([line.text for line in subs])
|
text = " ".join([line.text for line in subs])
|
||||||
new_media.set(f"subtitles_{lang}", text)
|
new_media.set(f"subtitles_{lang}", text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
|
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||||
result.add_media(new_media)
|
result.add_media(new_media)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"error processing entry {entry}: {e}")
|
logger.error(f"Error processing entry {entry}: {e}")
|
||||||
if not len(result.media):
|
if not len(result.media):
|
||||||
logger.info(f"no media found for entry {entry}, skipping.")
|
logger.info(f"No media found for entry {entry}, skipping.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
@@ -470,14 +470,14 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
||||||
if data.get("is_live", False) and not self.livestreams:
|
if data.get("is_live", False) and not self.livestreams:
|
||||||
logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
|
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||||
return False
|
return False
|
||||||
# it's a valid video, that the youtubdedl can download out of the box
|
# it's a valid video, that the youtubdedl can download out of the box
|
||||||
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||||
logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||||
raise SkipYtdlp()
|
raise SkipYtdlp()
|
||||||
|
|
||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
@@ -496,17 +496,17 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
if not isinstance(e, SkipYtdlp):
|
if not isinstance(e, SkipYtdlp):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||||
logger.error("error downloading metadata for post: {error}", error=str(post_e))
|
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
||||||
return False
|
return False
|
||||||
except Exception as generic_e:
|
except Exception as generic_e:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||||
name=info_extractor.IE_NAME,
|
name=info_extractor.IE_NAME,
|
||||||
error=str(generic_e),
|
error=str(generic_e),
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
@@ -559,17 +559,17 @@ class GenericExtractor(Extractor):
|
|||||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||||
if auth:
|
if auth:
|
||||||
if "username" in auth and "password" in auth:
|
if "username" in auth and "password" in auth:
|
||||||
logger.debug("using provided auth username and password")
|
logger.debug("Using provided auth username and password")
|
||||||
ydl_options.extend(("--username", auth["username"]))
|
ydl_options.extend(("--username", auth["username"]))
|
||||||
ydl_options.extend(("--password", auth["password"]))
|
ydl_options.extend(("--password", auth["password"]))
|
||||||
elif "cookie" in auth:
|
elif "cookie" in auth:
|
||||||
logger.debug("using provided auth cookie")
|
logger.debug("Using provided auth cookie")
|
||||||
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||||
elif "cookies_from_browser" in auth:
|
elif "cookies_from_browser" in auth:
|
||||||
logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
|
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']}")
|
||||||
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
||||||
elif "cookies_file" in auth:
|
elif "cookies_file" in auth:
|
||||||
logger.debug(f"using cookies from file {auth['cookies_file']}")
|
logger.debug(f"Using cookies from file {auth['cookies_file']}")
|
||||||
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
||||||
|
|
||||||
# Applying user-defined extractor_args
|
# Applying user-defined extractor_args
|
||||||
@@ -579,11 +579,11 @@ class GenericExtractor(Extractor):
|
|||||||
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
||||||
else:
|
else:
|
||||||
arg_str = str(args)
|
arg_str = str(args)
|
||||||
logger.debug(f"setting extractor_args: {key}:{arg_str}")
|
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
|
||||||
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
||||||
|
|
||||||
if self.ytdlp_args:
|
if self.ytdlp_args:
|
||||||
logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
|
logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}")
|
||||||
ydl_options += self.ytdlp_args.split(" ")
|
ydl_options += self.ytdlp_args.split(" ")
|
||||||
|
|
||||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
|
|||||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
|
logger.debug("Using Tikwm API to attempt to download tiktok video")
|
||||||
|
|
||||||
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
||||||
|
|
||||||
@@ -62,7 +62,7 @@ class Tiktok(GenericDropin):
|
|||||||
# get the video or fail
|
# get the video or fail
|
||||||
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
||||||
if not video_downloaded:
|
if not video_downloaded:
|
||||||
logger.error(f"failed to download video from {video_url}")
|
logger.error("Failed to download video")
|
||||||
return False
|
return False
|
||||||
video_media = Media(video_downloaded)
|
video_media = Media(video_downloaded)
|
||||||
if duration := post.get("duration", None):
|
if duration := post.get("duration", None):
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
|
|||||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
full_text = tweet.pop("full_text", "")
|
full_text = tweet.pop("full_text", "")
|
||||||
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
|
|||||||
|
|
||||||
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
||||||
if not tweet.get("entities", {}).get("media"):
|
if not tweet.get("entities", {}).get("media"):
|
||||||
logger.debug("no media found, archiving tweet text only")
|
logger.debug("No media found, archiving tweet text only")
|
||||||
result.status = "twitter-ytdl"
|
result.status = "twitter-ytdl"
|
||||||
return result
|
return result
|
||||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||||
|
|||||||
@@ -42,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
sh = self.open_sheet()
|
sh = self.open_sheet()
|
||||||
for ii, worksheet in enumerate(sh.worksheets()):
|
for ii, worksheet in enumerate(sh.worksheets()):
|
||||||
if not self.should_process_sheet(worksheet.title):
|
if not self.should_process_sheet(worksheet.title):
|
||||||
logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
|
logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules")
|
||||||
continue
|
continue
|
||||||
logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||||
if len(missing_cols := self.missing_required_columns(gw)):
|
if len(missing_cols := self.missing_required_columns(gw)):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
|
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
|
||||||
# process and yield metadata here:
|
# process and yield metadata here:
|
||||||
yield from self._process_rows(gw)
|
yield from self._process_rows(gw)
|
||||||
logger.info(f"finished worksheet {worksheet.title}")
|
logger.info(f"Finished worksheet {worksheet.title}")
|
||||||
|
|
||||||
def _process_rows(self, gw: GWorksheet):
|
def _process_rows(self, gw: GWorksheet):
|
||||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||||
@@ -133,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||||
cell_updates.append((row, col, final_value))
|
cell_updates.append((row, col, final_value))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"unable to batch {col}={final_value} due to {e}")
|
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||||
|
|
||||||
status_message = item.status
|
status_message = item.status
|
||||||
if cached:
|
if cached:
|
||||||
@@ -193,13 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
gw, row = self._retrieve_gsheet(item)
|
gw, row = self._retrieve_gsheet(item)
|
||||||
gw.set_cell(row, "status", new_status)
|
gw.set_cell(row, "status", new_status)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
|
logger.debug(f"Unable to update sheet: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||||
if gsheet := item.get_context("gsheet"):
|
if gsheet := item.get_context("gsheet"):
|
||||||
gw: GWorksheet = gsheet.get("worksheet")
|
gw: GWorksheet = gsheet.get("worksheet")
|
||||||
row: int = gsheet.get("row")
|
row: int = gsheet.get("row")
|
||||||
elif self.sheet_id:
|
elif self.sheet_id:
|
||||||
logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
|
logger.error("Unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
|
||||||
|
|
||||||
return gw, row
|
return gw, row
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class HashEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
logger.debug(f"calculating media hashes with algo={self.algorithm}")
|
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if len(hd := self.calculate_hash(m.filename)):
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
|
|||||||
def format(self, item: Metadata) -> Media:
|
def format(self, item: Metadata) -> Media:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if item.is_empty():
|
if item.is_empty():
|
||||||
logger.debug("nothing to format, skipping")
|
logger.debug("Nothing to format, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
content = self.template.render(
|
content = self.template.render(
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||||
return
|
return
|
||||||
if len(insta_matches) > 1:
|
if len(insta_matches) > 1:
|
||||||
logger.debug("multiple instagram matches found, using the first one")
|
logger.debug("Multiple instagram matches found, using the first one")
|
||||||
return
|
return
|
||||||
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
||||||
if g1 == "":
|
if g1 == "":
|
||||||
@@ -65,13 +65,13 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
return self.download_post(item, id=g3, context="story")
|
return self.download_post(item, id=g3, context="story")
|
||||||
return self.download_stories(item, g2)
|
return self.download_stories(item, g2)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"unknown instagram regex group match {g1=}")
|
logger.warning(f"Unknown instagram regex group match {g1=}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
|
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
|
||||||
def call_api(self, path: str, params: dict) -> dict:
|
def call_api(self, path: str, params: dict) -> dict:
|
||||||
headers = {"accept": "application/json", "x-access-key": self.access_token}
|
headers = {"accept": "application/json", "x-access-key": self.access_token}
|
||||||
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
|
logger.debug(f"Calling {self.api_endpoint}/{path} with {params=}")
|
||||||
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
|
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
|
||||||
|
|
||||||
def cleanup_dict(self, d: dict | list) -> dict:
|
def cleanup_dict(self, d: dict | list) -> dict:
|
||||||
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
count_posts += len(stories)
|
count_posts += len(stories)
|
||||||
result.set("#stories", len(stories))
|
result.set("#stories", len(stories))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading stories for {username}")
|
result.append("errors", f"Error downloading stories for {username}")
|
||||||
logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all posts
|
# download all posts
|
||||||
try:
|
try:
|
||||||
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading posts for {username}")
|
result.append("errors", f"Error downloading posts for {username}")
|
||||||
logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all tagged
|
# download all tagged
|
||||||
try:
|
try:
|
||||||
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading tagged posts for {username}")
|
result.append("errors", f"Error downloading tagged posts for {username}")
|
||||||
logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all highlights
|
# download all highlights
|
||||||
try:
|
try:
|
||||||
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append(
|
result.append(
|
||||||
"errors",
|
"errors",
|
||||||
f"error downloading highlight id{h.get('pk')} for {username}",
|
f"Error downloading highlight id{h.get('pk')} for {username}",
|
||||||
)
|
)
|
||||||
logger.error(
|
logger.error(
|
||||||
f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
if count_highlights >= max_to_download:
|
if count_highlights >= max_to_download:
|
||||||
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
|
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
|
||||||
@@ -210,8 +210,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.scrape_item(result, h, "highlight")
|
self.scrape_item(result, h, "highlight")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading highlight {h.get('id')}")
|
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
||||||
logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
return h_info
|
return h_info
|
||||||
|
|
||||||
@@ -248,13 +248,13 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
break
|
break
|
||||||
posts, end_cursor = posts[0], posts[1]
|
posts, end_cursor = posts[0], posts[1]
|
||||||
posts = posts[: min(max_to_download, len(posts))]
|
posts = posts[: min(max_to_download, len(posts))]
|
||||||
logger.info(f"parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}")
|
logger.info(f"Parsing {len(posts)} posts, next {end_cursor=} {post_count=} {max_to_download=}")
|
||||||
for p in posts:
|
for p in posts:
|
||||||
try:
|
try:
|
||||||
self.scrape_item(result, p, "post")
|
self.scrape_item(result, p, "post")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading post {p.get('id')}")
|
result.append("errors", f"Error downloading post {p.get('id')}")
|
||||||
logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
post_count += 1
|
post_count += 1
|
||||||
if post_count >= max_to_download:
|
if post_count >= max_to_download:
|
||||||
@@ -275,14 +275,14 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
break
|
break
|
||||||
next_page_id = resp.get("next_page_id")
|
next_page_id = resp.get("next_page_id")
|
||||||
|
|
||||||
logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")
|
logger.info(f"Parsing {len(posts)} tagged posts, next {next_page_id=}")
|
||||||
posts = posts[: min(max_to_download, len(posts))]
|
posts = posts[: min(max_to_download, len(posts))]
|
||||||
for p in posts:
|
for p in posts:
|
||||||
try:
|
try:
|
||||||
self.scrape_item(result, p, "tagged")
|
self.scrape_item(result, p, "tagged")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"error downloading tagged post {p.get('id')}")
|
result.append("errors", f"Error downloading tagged post {p.get('id')}")
|
||||||
logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
tagged_count += 1
|
tagged_count += 1
|
||||||
if tagged_count >= max_to_download:
|
if tagged_count >= max_to_download:
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ highlights, and tagged posts. Authentication is required via username/password o
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
import traceback
|
||||||
import instaloader
|
import instaloader
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
@@ -44,11 +45,11 @@ class InstagramExtractor(Extractor):
|
|||||||
self.insta.load_session_from_file(self.username, self.session_file)
|
self.insta.load_session_from_file(self.username, self.session_file)
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
logger.info("no valid session file found - Attempting login with use and password.")
|
logger.info("No valid session file found - Attempting login with username and password.")
|
||||||
self.insta.login(self.username, self.password)
|
self.insta.login(self.username, self.password)
|
||||||
self.insta.save_session_to_file(self.session_file)
|
self.insta.save_session_to_file(self.session_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
|
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
@@ -72,7 +73,7 @@ class InstagramExtractor(Extractor):
|
|||||||
result = self.download_profile(url, profile_matches[0])
|
result = self.download_profile(url, profile_matches[0])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||||
@@ -95,27 +96,27 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to download post: {post.shortcode}: {e}")
|
logger.error(f"Failed to download post: {post.shortcode}: {e} {traceback.format_exc()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed profile.get_posts: {e}")
|
logger.error(f"Failed profile.get_posts: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_tagged_posts():
|
for post in profile.get_tagged_posts():
|
||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
|
logger.error(f"Failed to download tagged post: {post.shortcode}: {e} {traceback.format_exc()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed profile.get_tagged_posts: {e}")
|
logger.error(f"Failed profile.get_tagged_posts: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_igtv_posts():
|
for post in profile.get_igtv_posts():
|
||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
|
logger.error(f"Failed to download igtv post: {post.shortcode}: {e} {traceback.format_exc()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed profile.get_igtv_posts: {e}")
|
logger.error(f"Failed profile.get_igtv_posts: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for story in self.insta.get_stories([profile.userid]):
|
for story in self.insta.get_stories([profile.userid]):
|
||||||
@@ -123,9 +124,9 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to download story item: {item}: {e}")
|
logger.error(f"Failed to download story item: {item}: {e} {traceback.format_exc()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed get_stories: {e}")
|
logger.error(f"Failed get_stories: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for highlight in self.insta.get_highlights(profile.userid):
|
for highlight in self.insta.get_highlights(profile.userid):
|
||||||
@@ -133,9 +134,9 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed to download highlight item: {item}: {e}")
|
logger.error(f"Failed to download highlight item: {item}: {e} {traceback.format_exc()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"failed get_highlights: {e}")
|
logger.error(f"Failed get_highlights: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
||||||
|
|
||||||
@@ -158,4 +159,4 @@ class InstagramExtractor(Extractor):
|
|||||||
|
|
||||||
return result.success("instagram")
|
return result.success("instagram")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"could not fetch instagram post due to: {e}")
|
logger.error(f"Could not fetch instagram post due to: {e} {traceback.format_exc()}")
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
1. makes a copy of session_file that is removed in cleanup
|
1. makes a copy of session_file that is removed in cleanup
|
||||||
2. checks if the session file is valid
|
2. checks if the session file is valid
|
||||||
"""
|
"""
|
||||||
logger.info(f"SETUP {self.name} checking login...")
|
logger.debug(f"SETUP {self.name} checking login...")
|
||||||
self._prepare_session_file()
|
self._prepare_session_file()
|
||||||
self._initialize_telegram_client()
|
self._initialize_telegram_client()
|
||||||
|
|
||||||
@@ -58,10 +58,10 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
|
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
|
||||||
)
|
)
|
||||||
with self.client.start():
|
with self.client.start():
|
||||||
logger.info(f"SETUP {self.name} login works.")
|
logger.debug(f"SETUP {self.name} login works.")
|
||||||
|
|
||||||
def cleanup(self) -> None:
|
def cleanup(self) -> None:
|
||||||
logger.info(f"CLEANUP {self.name}.")
|
logger.debug(f"CLEANUP {self.name}.")
|
||||||
session_file_name = self.session_file + ".session"
|
session_file_name = self.session_file + ".session"
|
||||||
if os.path.exists(session_file_name):
|
if os.path.exists(session_file_name):
|
||||||
os.remove(session_file_name)
|
os.remove(session_file_name)
|
||||||
@@ -79,17 +79,17 @@ class InstagramTbotExtractor(Extractor):
|
|||||||
|
|
||||||
# This may be outdated and replaced by the below message, but keeping until confirmed
|
# This may be outdated and replaced by the below message, but keeping until confirmed
|
||||||
if "You must enter a URL to a post" in message:
|
if "You must enter a URL to a post" in message:
|
||||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
logger.debug(f"Invalid link for {self.name}: {message}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if "Media not found or unavailable" in message:
|
if "Media not found or unavailable" in message:
|
||||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
logger.debug(f"No media found for {self.name}: {message}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
result.set_content(message).set_title(message[:128])
|
result.set_content(message).set_title(message[:128])
|
||||||
elif result.is_empty():
|
elif result.is_empty():
|
||||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
logger.debug(f"No media found for {self.name}: {message}")
|
||||||
return False
|
return False
|
||||||
return result.success("insta-via-bot")
|
return result.success("insta-via-bot")
|
||||||
|
|
||||||
|
|||||||
@@ -8,9 +8,7 @@ from auto_archiver.core import Media, Metadata
|
|||||||
|
|
||||||
class JsonEnricher(Enricher):
|
class JsonEnricher(Enricher):
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug("Enriching as JSON")
|
||||||
|
|
||||||
logger.debug(f"JSON Enricher for {url=}")
|
|
||||||
|
|
||||||
item_path = os.path.join(self.tmp_dir, "metadata.json")
|
item_path = os.path.join(self.tmp_dir, "metadata.json")
|
||||||
with open(item_path, mode="w", encoding="utf-8") as outf:
|
with open(item_path, mode="w", encoding="utf-8") as outf:
|
||||||
|
|||||||
@@ -38,8 +38,7 @@ class LocalStorage(Storage):
|
|||||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
||||||
|
|
||||||
res = shutil.copy2(media.filename, dest)
|
shutil.copy2(media.filename, dest)
|
||||||
logger.info(res)
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# must be implemented even if unused
|
# must be implemented even if unused
|
||||||
|
|||||||
@@ -12,20 +12,17 @@ class MetaEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
|
||||||
if to_enrich.is_empty():
|
if to_enrich.is_empty():
|
||||||
logger.debug(f"[SKIP] META_ENRICHER there is no media or metadata to enrich: {url=}")
|
logger.debug("[SKIP] META_ENRICHER there is no media or metadata to enrich")
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.debug(f"calculating archive metadata information for {url=}")
|
logger.debug("Calculating archive metadata information")
|
||||||
|
|
||||||
self.enrich_file_sizes(to_enrich)
|
self.enrich_file_sizes(to_enrich)
|
||||||
self.enrich_archive_duration(to_enrich)
|
self.enrich_archive_duration(to_enrich)
|
||||||
|
|
||||||
def enrich_file_sizes(self, to_enrich: Metadata):
|
def enrich_file_sizes(self, to_enrich: Metadata):
|
||||||
logger.debug(
|
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
|
||||||
f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)"
|
|
||||||
)
|
|
||||||
total_size = 0
|
total_size = 0
|
||||||
for media in to_enrich.get_all_media():
|
for media in to_enrich.get_all_media():
|
||||||
file_stats = os.stat(media.filename)
|
file_stats = os.stat(media.filename)
|
||||||
@@ -44,7 +41,7 @@ class MetaEnricher(Enricher):
|
|||||||
size /= 1024
|
size /= 1024
|
||||||
|
|
||||||
def enrich_archive_duration(self, to_enrich):
|
def enrich_archive_duration(self, to_enrich):
|
||||||
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
|
logger.debug("Calculating archive duration")
|
||||||
|
|
||||||
archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
|
archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
|
||||||
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
||||||
|
|||||||
@@ -12,8 +12,7 @@ class MetadataEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug("Extracting EXIF metadata")
|
||||||
logger.debug(f"extracting EXIF metadata for {url=}")
|
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if len(md := self.get_metadata(m.filename)):
|
if len(md := self.get_metadata(m.filename)):
|
||||||
@@ -31,8 +30,8 @@ class MetadataEnricher(Enricher):
|
|||||||
field, value = line.strip().split(":", 1)
|
field, value = line.strip().split(":", 1)
|
||||||
metadata[field.strip()] = value.strip()
|
metadata[field.strip()] = value.strip()
|
||||||
return metadata
|
return metadata
|
||||||
except FileNotFoundError:
|
except FileNotFoundError as e:
|
||||||
logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
|
logger.error(f"ExifTool not found. Make sure ExifTool is installed and added to PATH. {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||||
return {}
|
return {}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import traceback
|
||||||
|
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import opentimestamps
|
import opentimestamps
|
||||||
@@ -14,13 +15,12 @@ from auto_archiver.utils.misc import get_current_timestamp
|
|||||||
|
|
||||||
class OpentimestampsEnricher(Enricher):
|
class OpentimestampsEnricher(Enricher):
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug("OpenTimestamps timestamping files")
|
||||||
logger.debug(f"OpenTimestamps timestamping files for {url=}")
|
|
||||||
|
|
||||||
# Get the media files to timestamp
|
# Get the media files to timestamp
|
||||||
media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")]
|
media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")]
|
||||||
if not media_files:
|
if not media_files:
|
||||||
logger.debug(f"No files found to timestamp in {url=}")
|
logger.debug("No files found to timestamp")
|
||||||
return
|
return
|
||||||
|
|
||||||
timestamp_files = []
|
timestamp_files = []
|
||||||
@@ -94,7 +94,7 @@ class OpentimestampsEnricher(Enricher):
|
|||||||
detached_timestamp.serialize(ctx)
|
detached_timestamp.serialize(ctx)
|
||||||
f.write(ctx.getbytes())
|
f.write(ctx.getbytes())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to serialize timestamp file: {e}")
|
logger.warning(f"Failed to serialize timestamp file: {e} {traceback.format_exc()}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create media for the timestamp file
|
# Create media for the timestamp file
|
||||||
@@ -113,16 +113,16 @@ class OpentimestampsEnricher(Enricher):
|
|||||||
media.set("opentimestamps", True)
|
media.set("opentimestamps", True)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error while timestamping {media.filename}: {e}")
|
logger.warning(f"Error while timestamping {media.filename}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# Add timestamp files to the metadata
|
# Add timestamp files to the metadata
|
||||||
if timestamp_files:
|
if timestamp_files:
|
||||||
to_enrich.set("opentimestamped", True)
|
to_enrich.set("opentimestamped", True)
|
||||||
to_enrich.set("opentimestamps_count", len(timestamp_files))
|
to_enrich.set("opentimestamps_count", len(timestamp_files))
|
||||||
logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}")
|
logger.info(f"{len(timestamp_files)} OpenTimestamps proofs created")
|
||||||
else:
|
else:
|
||||||
to_enrich.set("opentimestamped", False)
|
to_enrich.set("opentimestamped", False)
|
||||||
logger.warning(f"No successful timestamps created for {url=}")
|
logger.warning("No successful timestamps created")
|
||||||
|
|
||||||
def verify_timestamp(self, detached_timestamp):
|
def verify_timestamp(self, detached_timestamp):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -28,8 +28,7 @@ class PdqHashEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug("Calculating perceptual hashes")
|
||||||
logger.debug(f"calculating perceptual hashes for {url=}")
|
|
||||||
media_with_hashes = []
|
media_with_hashes = []
|
||||||
|
|
||||||
for m in to_enrich.media:
|
for m in to_enrich.media:
|
||||||
@@ -44,7 +43,7 @@ class PdqHashEnricher(Enricher):
|
|||||||
media.set("pdq_hash", hd)
|
media.set("pdq_hash", hd)
|
||||||
media_with_hashes.append(media.filename)
|
media_with_hashes.append(media.filename)
|
||||||
|
|
||||||
logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}")
|
logger.debug(f"Calculated '{len(media_with_hashes)}' perceptual hashes: {media_with_hashes}")
|
||||||
|
|
||||||
def calculate_pdq_hash(self, filename):
|
def calculate_pdq_hash(self, filename):
|
||||||
# returns a hexadecimal string with the perceptual hash for the given filename
|
# returns a hexadecimal string with the perceptual hash for the given filename
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ class S3Storage(Storage):
|
|||||||
if existing_key := self.file_in_folder(path):
|
if existing_key := self.file_in_folder(path):
|
||||||
media._key = existing_key
|
media._key = existing_key
|
||||||
media.set("previously archived", True)
|
media.set("previously archived", True)
|
||||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
logger.debug(f"Skipping upload of {media.filename} because it already exists in {media.key}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
_, ext = os.path.splitext(media.key)
|
_, ext = os.path.splitext(media.key)
|
||||||
|
|||||||
@@ -19,10 +19,10 @@ class SSLEnricher(Enricher):
|
|||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
|
assert parsed.scheme in ["https"], "Invalid URL scheme"
|
||||||
|
|
||||||
domain = parsed.netloc
|
domain = parsed.netloc
|
||||||
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
|
logger.debug(f"Fetching SSL certificate for {domain=}")
|
||||||
|
|
||||||
cert = ssl.get_server_certificate((domain, 443))
|
cert = ssl.get_server_certificate((domain, 443))
|
||||||
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class TelegramExtractor(Extractor):
|
|||||||
|
|
||||||
video = s.find("video")
|
video = s.find("video")
|
||||||
if video is None:
|
if video is None:
|
||||||
logger.warning("could not find video")
|
logger.warning("Could not find video")
|
||||||
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
||||||
|
|
||||||
image_urls = []
|
image_urls = []
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ class TelethonExtractor(Extractor):
|
|||||||
# get currently joined channels
|
# get currently joined channels
|
||||||
# https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog
|
# https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog
|
||||||
joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel]
|
joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel]
|
||||||
logger.info(f"already part of {len(joined_channel_ids)} channels")
|
logger.info(f"Already part of {len(joined_channel_ids)} channels")
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites))
|
pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites))
|
||||||
@@ -80,22 +80,22 @@ class TelethonExtractor(Extractor):
|
|||||||
else:
|
else:
|
||||||
ent = self.client.get_entity(invite) # fails if not a member
|
ent = self.client.get_entity(invite) # fails if not a member
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
|
f"Please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
|
||||||
)
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.info(f"joining new channel {invite=}")
|
logger.info(f"Joining new channel {invite=}")
|
||||||
try:
|
try:
|
||||||
self.client(ImportChatInviteRequest(match.group(2)))
|
self.client(ImportChatInviteRequest(match.group(2)))
|
||||||
except UserAlreadyParticipantError:
|
except UserAlreadyParticipantError:
|
||||||
logger.info(f"already joined {invite=}")
|
logger.info(f"Already joined {invite=}")
|
||||||
except InviteRequestSentError:
|
except InviteRequestSentError:
|
||||||
logger.warning(f"already sent a join request with {invite} still no answer")
|
logger.warning(f"Already sent a join request with {invite} still no answer")
|
||||||
except InviteHashExpiredError:
|
except InviteHashExpiredError:
|
||||||
logger.warning(f"{invite=} has expired please find a more recent one")
|
logger.warning(f"{invite=} has expired please find a more recent one")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"could not join channel with {invite=} due to {e}")
|
logger.error(f"Could not join channel with {invite=} due to {e}")
|
||||||
except FloodWaitError as e:
|
except FloodWaitError as e:
|
||||||
logger.warning(f"got a flood error, need to wait {e.seconds} seconds")
|
logger.warning(f"Got a flood error, need to wait {e.seconds} seconds")
|
||||||
time.sleep(e.seconds)
|
time.sleep(e.seconds)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
@@ -117,7 +117,7 @@ class TelethonExtractor(Extractor):
|
|||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
match = self.valid_url.search(url)
|
match = self.valid_url.search(url)
|
||||||
logger.debug(f"TELETHON: {match=}")
|
logger.debug(f"Found telethon url {match=}")
|
||||||
if not match:
|
if not match:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -135,10 +135,10 @@ class TelethonExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id]))
|
stories = self.client(functions.stories.GetStoriesByIDRequest(peer=chat, id=[post_id]))
|
||||||
if not stories.stories:
|
if not stories.stories:
|
||||||
logger.info(f"No stories found for {url}, possibly it's private or the story has expired.")
|
logger.info("No stories found, possibly it's private or the story has expired.")
|
||||||
return False
|
return False
|
||||||
story = stories.stories[0]
|
story = stories.stories[0]
|
||||||
logger.debug(f"TELETHON got story {story.id=} {story.date=} {story.expire_date=}")
|
logger.debug(f"Got story {story.id=} {story.date=} {story.expire_date=}")
|
||||||
result.set_timestamp(story.date).set("views", story.views.to_dict()).set(
|
result.set_timestamp(story.date).set("views", story.views.to_dict()).set(
|
||||||
"expire_date", story.expire_date
|
"expire_date", story.expire_date
|
||||||
)
|
)
|
||||||
@@ -154,20 +154,20 @@ class TelethonExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
post = self.client.get_messages(chat, ids=post_id)
|
post = self.client.get_messages(chat, ids=post_id)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
logger.error(f"Could not fetch telegram URL possibly it's private: {e}")
|
||||||
return False
|
return False
|
||||||
except ChannelInvalidError as e:
|
except ChannelInvalidError as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}"
|
f"Could not fetch telegram URL. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}"
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.debug(f"TELETHON got post {post=}")
|
logger.debug(f"Got post {post=}")
|
||||||
if post is None:
|
if post is None:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
media_posts = self._get_media_posts_in_group(chat, post)
|
media_posts = self._get_media_posts_in_group(chat, post)
|
||||||
logger.debug(f"got {len(media_posts)=} for {url=}")
|
logger.debug(f"Got {len(media_posts)=}")
|
||||||
|
|
||||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||||
title = post.message
|
title = post.message
|
||||||
|
|||||||
@@ -27,12 +27,12 @@ class ThumbnailEnricher(Enricher):
|
|||||||
Calculates how many thumbnails to generate and at which timestamps based on the video duration, the number of thumbnails per minute and the max number of thumbnails.
|
Calculates how many thumbnails to generate and at which timestamps based on the video duration, the number of thumbnails per minute and the max number of thumbnails.
|
||||||
Thumbnails are equally distributed across the video duration.
|
Thumbnails are equally distributed across the video duration.
|
||||||
"""
|
"""
|
||||||
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
|
logger.debug("Generating thumbnails")
|
||||||
for m_id, m in enumerate(to_enrich.media[::]):
|
for m_id, m in enumerate(to_enrich.media[::]):
|
||||||
if m.is_video():
|
if m.is_video():
|
||||||
folder = os.path.join(self.tmp_dir, random_str(24))
|
folder = os.path.join(self.tmp_dir, random_str(24))
|
||||||
os.makedirs(folder, exist_ok=True)
|
os.makedirs(folder, exist_ok=True)
|
||||||
logger.debug(f"generating thumbnails for {m.filename}")
|
logger.debug(f"Generating thumbnails for {m.filename}")
|
||||||
duration = m.get("duration")
|
duration = m.get("duration")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -42,10 +42,10 @@ class ThumbnailEnricher(Enricher):
|
|||||||
)
|
)
|
||||||
to_enrich.media[m_id].set("duration", duration)
|
to_enrich.media[m_id].set("duration", duration)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"failed to get duration with FFMPEG from {m.filename}: {e}")
|
logger.warning(f"Failed to get duration with FFMPEG from {m.filename}: {e}")
|
||||||
|
|
||||||
if not duration or type(duration) not in [float, int] or duration <= 0:
|
if not duration or type(duration) not in [float, int] or duration <= 0:
|
||||||
logger.warning(f"cannot generate thumbnails for {m.filename} without valid duration")
|
logger.warning(f"Cannot generate thumbnails for {m.filename} without valid duration")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
||||||
|
|||||||
@@ -49,8 +49,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
self.session.close()
|
self.session.close()
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug(f"RFC3161 timestamping existing files")
|
||||||
logger.debug(f"RFC3161 timestamping existing files for {url=}")
|
|
||||||
|
|
||||||
# create a new text file with the existing media hashes
|
# create a new text file with the existing media hashes
|
||||||
hashes = [
|
hashes = [
|
||||||
@@ -58,7 +57,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
]
|
]
|
||||||
|
|
||||||
if not len(hashes):
|
if not len(hashes):
|
||||||
logger.debug(f"No hashes found in {url=}")
|
logger.debug(f"No hashes found")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@@ -74,7 +73,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
try:
|
try:
|
||||||
message = bytes(data_to_sign, encoding='utf8')
|
message = bytes(data_to_sign, encoding='utf8')
|
||||||
|
|
||||||
logger.debug(f"Timestamping {url=} with {tsa_url=}")
|
logger.debug(f"Timestamping with {tsa_url=}")
|
||||||
signed: TimeStampResponse = self.sign_data(tsa_url, message)
|
signed: TimeStampResponse = self.sign_data(tsa_url, message)
|
||||||
|
|
||||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
|
# fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
|
||||||
@@ -92,7 +91,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url)
|
timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url)
|
||||||
timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")
|
logger.warning(f"Error while timestamping with {tsa_url=}: {e}")
|
||||||
|
|
||||||
if len(timestamp_tokens):
|
if len(timestamp_tokens):
|
||||||
hashes_media.set("timestamp_authority_files", timestamp_tokens)
|
hashes_media.set("timestamp_authority_files", timestamp_tokens)
|
||||||
@@ -101,9 +100,9 @@ class TimestampingEnricher(Enricher):
|
|||||||
hashes_media.set("cryptography v", version("cryptography"))
|
hashes_media.set("cryptography v", version("cryptography"))
|
||||||
to_enrich.add_media(hashes_media, id="timestamped_hashes")
|
to_enrich.add_media(hashes_media, id="timestamped_hashes")
|
||||||
to_enrich.set("timestamped", True)
|
to_enrich.set("timestamped", True)
|
||||||
logger.info(f"{len(timestamp_tokens)} timestamp tokens created for {url=}")
|
logger.info(f"{len(timestamp_tokens)} timestamp tokens created")
|
||||||
else:
|
else:
|
||||||
logger.warning(f"No successful timestamps for {url=}")
|
logger.warning(f"No successful timestamps found")
|
||||||
|
|
||||||
def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str:
|
def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -45,10 +45,9 @@ class TwitterApiExtractor(Extractor):
|
|||||||
if "https://t.co/" in url:
|
if "https://t.co/" in url:
|
||||||
try:
|
try:
|
||||||
r = requests.get(url, timeout=30)
|
r = requests.get(url, timeout=30)
|
||||||
logger.debug(f"Expanded url {url} to {r.url}")
|
|
||||||
url = r.url
|
url = r.url
|
||||||
except Exception:
|
except Exception as e:
|
||||||
logger.error(f"Failed to expand url {url}")
|
logger.error(f"Failed to expand Twitter URL: {e}")
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
@@ -67,7 +66,7 @@ class TwitterApiExtractor(Extractor):
|
|||||||
return False, False
|
return False, False
|
||||||
|
|
||||||
username, tweet_id = matches[0] # only one URL supported
|
username, tweet_id = matches[0] # only one URL supported
|
||||||
logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
|
logger.debug(f"Found {username=} and {tweet_id=}")
|
||||||
|
|
||||||
return username, tweet_id
|
return username, tweet_id
|
||||||
|
|
||||||
@@ -85,7 +84,7 @@ class TwitterApiExtractor(Extractor):
|
|||||||
media_fields=["type", "duration_ms", "url", "variants"],
|
media_fields=["type", "duration_ms", "url", "variants"],
|
||||||
tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
|
tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
|
||||||
)
|
)
|
||||||
logger.debug(tweet)
|
logger.debug(f"Got {tweet=}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Could not get tweet: {e}")
|
logger.error(f"Could not get tweet: {e}")
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
# call docker if explicitly enabled or we are running on the host (not in docker)
|
# call docker if explicitly enabled or we are running on the host (not in docker)
|
||||||
if self.use_docker:
|
if self.use_docker:
|
||||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
logger.debug("Generating WACZ in Docker")
|
||||||
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
|
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
|
||||||
if self.docker_commands:
|
if self.docker_commands:
|
||||||
cmd = self.docker_commands + cmd
|
cmd = self.docker_commands + cmd
|
||||||
@@ -111,12 +111,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
if self.profile:
|
if self.profile:
|
||||||
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
||||||
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||||
logger.debug(f"copying {self.profile} to {profile_fn}")
|
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
|
cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
logger.debug("Generating WACZ without Docker")
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
|
cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
|
||||||
|
|||||||
@@ -31,15 +31,15 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
if UrlUtil.is_auth_wall(url):
|
if UrlUtil.is_auth_wall(url):
|
||||||
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
|
logger.debug("[SKIP] WAYBACK since url is behind AUTH WALL")
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.debug(f"calling wayback for {url=}")
|
|
||||||
|
|
||||||
if to_enrich.get("wayback"):
|
if to_enrich.get("wayback"):
|
||||||
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
|
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
logger.debug("Calling Wayback")
|
||||||
|
|
||||||
ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"}
|
ia_headers = {"Accept": "application/json", "Authorization": f"LOW {self.key}:{self.secret}"}
|
||||||
post_data = {"url": url}
|
post_data = {"url": url}
|
||||||
if self.if_not_archived_within:
|
if self.if_not_archived_within:
|
||||||
@@ -68,7 +68,7 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
attempt = 1
|
attempt = 1
|
||||||
while not wayback_url and time.time() - start_time <= self.timeout:
|
while not wayback_url and time.time() - start_time <= self.timeout:
|
||||||
try:
|
try:
|
||||||
logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
|
logger.debug(f"GETting status for {job_id=} ({attempt=})")
|
||||||
r_status = requests.get(
|
r_status = requests.get(
|
||||||
f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies
|
f"https://web.archive.org/save/status/{job_id}", headers=ia_headers, proxies=proxies
|
||||||
)
|
)
|
||||||
@@ -79,13 +79,13 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
|||||||
logger.error(f"Wayback failed with {r_json}")
|
logger.error(f"Wayback failed with {r_json}")
|
||||||
return False
|
return False
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
|
logger.warning(f"RequestException: fetching status due to: {e}")
|
||||||
break
|
break
|
||||||
except json.decoder.JSONDecodeError:
|
except json.decoder.JSONDecodeError:
|
||||||
logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
|
logger.error(f"Expected a JSON from Wayback and got {r.text}")
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"error fetching status for {url=} due to: {e}")
|
logger.warning(f"error fetching status due to: {e}")
|
||||||
if not wayback_url:
|
if not wayback_url:
|
||||||
attempt += 1
|
attempt += 1
|
||||||
time.sleep(1) # TODO: can be improved with exponential backoff
|
time.sleep(1) # TODO: can be improved with exponential backoff
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
|
|||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
logger.debug(f"WHISPER[{self.action}]: iterating media items")
|
||||||
|
|
||||||
job_results = {}
|
job_results = {}
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
@@ -35,7 +35,7 @@ class WhisperEnricher(Enricher):
|
|||||||
try:
|
try:
|
||||||
job_id = self.submit_job(m)
|
job_id = self.submit_job(m)
|
||||||
job_results[job_id] = False
|
job_results[job_id] = False
|
||||||
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
|
logger.debug(f"Job submitted: {job_id=} for {m.key=}")
|
||||||
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
|
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
@@ -72,14 +72,14 @@ class WhisperEnricher(Enricher):
|
|||||||
"type": self.action,
|
"type": self.action,
|
||||||
# "language": "string" # may be a config
|
# "language": "string" # may be a config
|
||||||
}
|
}
|
||||||
logger.debug(f"calling API with {payload=}")
|
logger.debug(f"Calling API with {payload=}")
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"}
|
f"{self.api_endpoint}/jobs", json=payload, headers={"Authorization": f"Bearer {self.api_key}"}
|
||||||
)
|
)
|
||||||
assert response.status_code == 201, (
|
assert response.status_code == 201, (
|
||||||
f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
|
f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
|
||||||
)
|
)
|
||||||
logger.debug(response.json())
|
logger.debug(f"Response from whisper API: {response.json()}")
|
||||||
return response.json()["id"]
|
return response.json()["id"]
|
||||||
|
|
||||||
def check_jobs(self, job_results: dict):
|
def check_jobs(self, job_results: dict):
|
||||||
@@ -115,7 +115,7 @@ class WhisperEnricher(Enricher):
|
|||||||
assert r_res.status_code == 200, (
|
assert r_res.status_code == 200, (
|
||||||
f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
||||||
)
|
)
|
||||||
logger.success(r_res.json())
|
logger.info(f"Job {job_id} completed successfully:{r_res.json()}")
|
||||||
result = {}
|
result = {}
|
||||||
for art_id, artifact in enumerate(r_res.json()):
|
for art_id, artifact in enumerate(r_res.json()):
|
||||||
subtitle = []
|
subtitle = []
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import uuid
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
|
|
||||||
import requests
|
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
@@ -15,18 +14,6 @@ def mkdir_if_not_exists(folder):
|
|||||||
os.makedirs(folder)
|
os.makedirs(folder)
|
||||||
|
|
||||||
|
|
||||||
def expand_url(url):
|
|
||||||
# expand short URL links
|
|
||||||
if "https://t.co/" in url:
|
|
||||||
try:
|
|
||||||
r = requests.get(url)
|
|
||||||
logger.debug(f"Expanded url {url} to {r.url}")
|
|
||||||
return r.url
|
|
||||||
except Exception:
|
|
||||||
logger.error(f"Failed to expand url {url}")
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def getattr_or(o: object, prop: str, default=None):
|
def getattr_or(o: object, prop: str, default=None):
|
||||||
try:
|
try:
|
||||||
res = getattr(o, prop)
|
res = getattr(o, prop)
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def test_fetch_fail_status(api_db, metadata, mocker):
|
|||||||
mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get")
|
mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get")
|
||||||
mock_get.return_value.status_code = 400
|
mock_get.return_value.status_code = 400
|
||||||
mock_get.return_value.json.return_value = {}
|
mock_get.return_value.json.return_value = {}
|
||||||
mock_error = mocker.patch("loguru.logger.error")
|
mock_error = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
|
||||||
assert api_db.fetch(metadata) is False
|
assert api_db.fetch(metadata) is False
|
||||||
mock_error.assert_called_once_with("AA API FAIL (400): {}")
|
mock_error.assert_called_once_with("AA API FAIL (400): {}")
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata):
|
|||||||
"""Test that enrich() does nothing when Metadata is empty."""
|
"""Test that enrich() does nothing when Metadata is empty."""
|
||||||
mock_metadata.is_empty.return_value = True
|
mock_metadata.is_empty.return_value = True
|
||||||
meta_enricher.enrich(mock_metadata)
|
meta_enricher.enrich(mock_metadata)
|
||||||
mock_metadata.get_url.assert_called_once()
|
|
||||||
|
|
||||||
|
|
||||||
def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
|
def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ def test_enrich_empty_media(enricher, mocker):
|
|||||||
|
|
||||||
def test_get_metadata_error_handling(enricher, mocker):
|
def test_get_metadata_error_handling(enricher, mocker):
|
||||||
mocker.patch("subprocess.run", side_effect=Exception("Test error"))
|
mocker.patch("subprocess.run", side_effect=Exception("Test error"))
|
||||||
mock_log = mocker.patch("loguru.logger.error")
|
mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
|
||||||
result = enricher.get_metadata("test.jpg")
|
result = enricher.get_metadata("test.jpg")
|
||||||
assert result == {}
|
assert result == {}
|
||||||
assert "Error occurred: " in mock_log.call_args[0][0]
|
assert "Error occurred: " in mock_log.call_args[0][0]
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ def test_enrich_skip_non_image(metadata_with_images, mocker):
|
|||||||
def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
|
def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
|
||||||
mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image"))
|
mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image"))
|
||||||
mock_pdq = mocker.patch("pdqhash.compute")
|
mock_pdq = mocker.patch("pdqhash.compute")
|
||||||
mock_logger = mocker.patch("loguru.logger.error")
|
mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.error")
|
||||||
enricher = PdqHashEnricher()
|
enricher = PdqHashEnricher()
|
||||||
enricher.enrich(metadata_with_images)
|
enricher.enrich(metadata_with_images)
|
||||||
|
|
||||||
|
|||||||
@@ -75,12 +75,12 @@ def test_enrich_thumbnail_limits(
|
|||||||
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
|
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
|
||||||
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
|
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
|
||||||
mocker.patch("os.makedirs")
|
mocker.patch("os.makedirs")
|
||||||
mock_logger = mocker.patch("loguru.logger.warning")
|
mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning")
|
||||||
mocker.patch.object(Media, "is_video", return_value=True)
|
mocker.patch.object(Media, "is_video", return_value=True)
|
||||||
|
|
||||||
thumbnail_enricher.enrich(metadata_with_video)
|
thumbnail_enricher.enrich(metadata_with_video)
|
||||||
# Ensure error was logged
|
# Ensure error was logged
|
||||||
mock_logger.assert_called_with("cannot generate thumbnails for video.mp4 without valid duration")
|
mock_logger.assert_called_with("Cannot generate thumbnails for video.mp4 without valid duration")
|
||||||
# Ensure no thumbnails were created
|
# Ensure no thumbnails were created
|
||||||
thumbnails = metadata_with_video.media[0].get("thumbnails")
|
thumbnails = metadata_with_video.media[0].get("thumbnails")
|
||||||
assert thumbnails is None
|
assert thumbnails is None
|
||||||
@@ -128,12 +128,12 @@ def test_enrich_handles_short_video(
|
|||||||
|
|
||||||
|
|
||||||
def test_uses_existing_duration_on_exception(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
|
def test_uses_existing_duration_on_exception(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
|
||||||
mock_logger = mocker.patch("loguru.logger.warning")
|
mock_logger = mocker.patch("auto_archiver.utils.custom_logger.logger.warning")
|
||||||
mock_probe = mocker.patch("ffmpeg.probe", side_effect=Exception("New probe error"))
|
mock_probe = mocker.patch("ffmpeg.probe", side_effect=Exception("New probe error"))
|
||||||
metadata_with_video.media[0].set("duration", 3)
|
metadata_with_video.media[0].set("duration", 3)
|
||||||
thumbnail_enricher.enrich(metadata_with_video)
|
thumbnail_enricher.enrich(metadata_with_video)
|
||||||
mock_probe.assert_called_once()
|
mock_probe.assert_called_once()
|
||||||
mock_logger.assert_called_with("failed to get duration with FFMPEG from video.mp4: New probe error")
|
mock_logger.assert_called_with("Failed to get duration with FFMPEG from video.mp4: New probe error")
|
||||||
assert mock_ffmpeg_environment["mock_output"].run.call_count == 3
|
assert mock_ffmpeg_environment["mock_output"].run.call_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ def test_setup_with_docker(wacz_enricher, mocker):
|
|||||||
|
|
||||||
def test_already_ran(wacz_enricher, metadata, mocker):
|
def test_already_ran(wacz_enricher, metadata, mocker):
|
||||||
metadata.add_media(Media("test.wacz"), id="browsertrix")
|
metadata.add_media(Media("test.wacz"), id="browsertrix")
|
||||||
mock_log = mocker.patch("loguru.logger.info")
|
mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info")
|
||||||
assert wacz_enricher.enrich(metadata) is True
|
assert wacz_enricher.enrich(metadata) is True
|
||||||
assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
|
assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
|
||||||
|
|
||||||
@@ -73,7 +73,7 @@ def test_download_success(wacz_enricher, mocker) -> None:
|
|||||||
|
|
||||||
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
|
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
|
||||||
"""Test enrich if already executed."""
|
"""Test enrich if already executed."""
|
||||||
mock_log = mocker.patch("loguru.logger.info")
|
mock_log = mocker.patch("auto_archiver.utils.custom_logger.logger.info")
|
||||||
metadata = Metadata().set_url("https://example.com")
|
metadata = Metadata().set_url("https://example.com")
|
||||||
media = Media(filename="some_file.wacz")
|
media = Media(filename="some_file.wacz")
|
||||||
metadata.add_media(media, id="browsertrix")
|
metadata.add_media(media, id="browsertrix")
|
||||||
|
|||||||
@@ -118,8 +118,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
|
|||||||
|
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
orchestrator.setup_config(test_args)
|
orchestrator.setup_config(test_args)
|
||||||
|
assert "the following arguments are required: --example_module.required_field" in caplog.records[0].message
|
||||||
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
|
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import pytest
|
|||||||
|
|
||||||
from auto_archiver.utils.misc import (
|
from auto_archiver.utils.misc import (
|
||||||
mkdir_if_not_exists,
|
mkdir_if_not_exists,
|
||||||
expand_url,
|
|
||||||
getattr_or,
|
getattr_or,
|
||||||
DateTimeEncoder,
|
DateTimeEncoder,
|
||||||
dump_payload,
|
dump_payload,
|
||||||
@@ -39,26 +38,6 @@ class TestDirectoryUtils:
|
|||||||
assert existing_dir.exists()
|
assert existing_dir.exists()
|
||||||
|
|
||||||
|
|
||||||
class TestURLExpansion:
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_url,expected",
|
|
||||||
[("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")],
|
|
||||||
)
|
|
||||||
def test_expand_url(self, input_url, expected, mocker):
|
|
||||||
mock_response = mocker.Mock()
|
|
||||||
mock_response.url = "https://expanded.url"
|
|
||||||
mocker.patch("requests.get", return_value=mock_response)
|
|
||||||
result = expand_url(input_url)
|
|
||||||
assert result == expected
|
|
||||||
|
|
||||||
def test_expand_url_handles_errors(self, caplog, mocker):
|
|
||||||
mocker.patch("requests.get", side_effect=Exception("Connection error"))
|
|
||||||
url = "https://t.co/error"
|
|
||||||
result = expand_url(url)
|
|
||||||
assert result == url
|
|
||||||
assert f"Failed to expand url {url}" in caplog.text
|
|
||||||
|
|
||||||
|
|
||||||
class TestAttributeHandling:
|
class TestAttributeHandling:
|
||||||
class Sample:
|
class Sample:
|
||||||
exists = "value"
|
exists = "value"
|
||||||
|
|||||||
Reference in New Issue
Block a user