mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
WIP refactor logging
This commit is contained in:
@@ -14,7 +14,7 @@ You will need to provide your phone number and a 2FA code the first time you run
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
from telethon.sync import TelegramClient
|
from telethon.sync import TelegramClient
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
# Create a
|
# Create a
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
|
|||||||
from auto_archiver.utils import url as UrlUtil
|
from auto_archiver.utils import url as UrlUtil
|
||||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .module import ModuleFactory
|
from .module import ModuleFactory
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from auto_archiver.core.consts import MODULE_TYPES
|
from auto_archiver.core.consts import MODULE_TYPES
|
||||||
@@ -118,8 +118,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
|||||||
"""
|
"""
|
||||||
Override of error to format a nicer looking error message using logger
|
Override of error to format a nicer looking error message using logger
|
||||||
"""
|
"""
|
||||||
logger.error("Problem with configuration file (tip: use --help to see the available options):")
|
logger.error(f"Problem with configuration file (tip: use --help to see the available options): \n{message}")
|
||||||
logger.error(message)
|
|
||||||
self.exit(2)
|
self.exit(2)
|
||||||
|
|
||||||
def parse_known_args(self, args=None, namespace=None):
|
def parse_known_args(self, args=None, namespace=None):
|
||||||
@@ -136,8 +135,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
|||||||
try:
|
try:
|
||||||
self._check_value(action, action.default)
|
self._check_value(action, action.default)
|
||||||
except argparse.ArgumentError as e:
|
except argparse.ArgumentError as e:
|
||||||
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):")
|
logger.error(f"You have an invalid setting in your configuration file ({action.dest}):\n {e}")
|
||||||
logger.error(e)
|
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
return super().parse_known_args(args, namespace)
|
return super().parse_known_args(args, namespace)
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from contextlib import suppress
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ class Extractor(BaseModule):
|
|||||||
to_filename = to_filename[-64:]
|
to_filename = to_filename[-64:]
|
||||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||||
if verbose:
|
if verbose:
|
||||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
logger.debug(f"downloading {to_filename=}")
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
@@ -117,7 +117,7 @@ class Extractor(BaseModule):
|
|||||||
return to_filename
|
return to_filename
|
||||||
|
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
|
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||||
if try_best_quality:
|
if try_best_quality:
|
||||||
return None, url
|
return None, url
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
|
|||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
@dataclass_json # annotation order matters
|
@dataclass_json # annotation order matters
|
||||||
@@ -121,8 +121,7 @@ class Media:
|
|||||||
except Error:
|
except Error:
|
||||||
return False # ffmpeg errors when reading bad files
|
return False # ffmpeg errors when reading bad files
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(f"{e}: {traceback.format_exc()}")
|
||||||
logger.error(traceback.format_exc())
|
|
||||||
try:
|
try:
|
||||||
fsize = os.path.getsize(self.filename)
|
fsize = os.path.getsize(self.filename)
|
||||||
return fsize > 20_000
|
return fsize > 20_000
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from dataclasses_json import dataclass_json
|
|||||||
import datetime
|
import datetime
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from .media import Media
|
from .media import Media
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ import sys
|
|||||||
from importlib.util import find_spec
|
from importlib.util import find_spec
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import auto_archiver
|
import auto_archiver
|
||||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
|
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
|
||||||
|
|
||||||
|
|||||||
@@ -15,9 +15,11 @@ import traceback
|
|||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from rich_argparse import RichHelpFormatter
|
from rich_argparse import RichHelpFormatter
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from auto_archiver.utils.misc import random_str
|
||||||
|
|
||||||
from .metadata import Metadata, Media
|
from .metadata import Metadata, Media
|
||||||
from auto_archiver.version import __version__
|
from auto_archiver.version import __version__
|
||||||
from .config import (
|
from .config import (
|
||||||
@@ -342,7 +344,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
# add other logging info
|
# add other logging info
|
||||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||||
use_level = logging_config["level"]
|
use_level = logging_config["level"]
|
||||||
self.logger_id = logger.add(sys.stderr, level=use_level)
|
self.logger_id = logger.add(
|
||||||
|
sys.stderr,
|
||||||
|
level=use_level,
|
||||||
|
catch=True,
|
||||||
|
format="<level>{level}</level>: <fg #64FFDA>{message}</fg #64FFDA> {extra[serialize_no_message]}",
|
||||||
|
)
|
||||||
|
|
||||||
rotation = logging_config["rotation"]
|
rotation = logging_config["rotation"]
|
||||||
log_file = logging_config["file"]
|
log_file = logging_config["file"]
|
||||||
@@ -356,9 +363,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
f"{log_file}.{i}_{level.lower()}",
|
f"{log_file}.{i}_{level.lower()}",
|
||||||
filter=lambda rec, lvl=level: rec["level"].name == lvl,
|
filter=lambda rec, lvl=level: rec["level"].name == lvl,
|
||||||
rotation=rotation,
|
rotation=rotation,
|
||||||
|
format="{extra[serialized]}",
|
||||||
)
|
)
|
||||||
elif log_file:
|
elif log_file:
|
||||||
logger.add(log_file, rotation=rotation, level=use_level)
|
logger.add(log_file, rotation=rotation, level=use_level, format="{extra[serialized]}")
|
||||||
|
|
||||||
def install_modules(self, modules_by_type):
|
def install_modules(self, modules_by_type):
|
||||||
"""
|
"""
|
||||||
@@ -466,13 +474,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||||
else:
|
else:
|
||||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||||
logger.warning("")
|
|
||||||
logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
|
f"\n********* IMPORTANT: UPDATE AVAILABLE ********\nA new version of auto-archiver is available (v{latest_version}, you have v{current_version})\nMake sure to update to the latest version using: {update_cmd}\n"
|
||||||
)
|
)
|
||||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
|
||||||
logger.warning("")
|
|
||||||
|
|
||||||
def setup(self, args: list):
|
def setup(self, args: list):
|
||||||
"""
|
"""
|
||||||
@@ -522,7 +526,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
self.setup(args)
|
self.setup(args)
|
||||||
return self.feed()
|
return self.feed()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(f"{e}: {traceback.format_exc()}")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
def cleanup(self) -> None:
|
def cleanup(self) -> None:
|
||||||
@@ -534,10 +538,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
url_count = 0
|
url_count = 0
|
||||||
for feeder in self.feeders:
|
for feeder in self.feeders:
|
||||||
for item in feeder:
|
for item in feeder:
|
||||||
|
with logger.contextualize(url=item.get_url(), trace=random_str(12)):
|
||||||
|
logger.info("started processing")
|
||||||
yield self.feed_item(item)
|
yield self.feed_item(item)
|
||||||
url_count += 1
|
url_count += 1
|
||||||
|
|
||||||
logger.info(f"Processed {url_count} URL(s)")
|
logger.info(f"processed {url_count} URL(s)")
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
|
|
||||||
def feed_item(self, item: Metadata) -> Metadata:
|
def feed_item(self, item: Metadata) -> Metadata:
|
||||||
@@ -555,13 +561,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
return self.archive(item)
|
return self.archive(item)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# catches keyboard interruptions to do a clean exit
|
# catches keyboard interruptions to do a clean exit
|
||||||
logger.warning(f"caught interrupt on {item=}")
|
logger.warning("caught interrupt")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.aborted(item)
|
d.aborted(item)
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
exit()
|
exit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
logger.error(f"Got unexpected error: {e}\n{traceback.format_exc()}")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
if isinstance(e, AssertionError):
|
if isinstance(e, AssertionError):
|
||||||
d.failed(item, str(e))
|
d.failed(item, str(e))
|
||||||
@@ -589,7 +595,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
try:
|
try:
|
||||||
check_url_or_raise(original_url)
|
check_url_or_raise(original_url)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.error(f"Error archiving URL {original_url}: {e}")
|
logger.error(f"Error archiving: {e}")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
|
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
|
||||||
@@ -599,7 +605,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
|
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
if original_url != url:
|
if original_url != url:
|
||||||
logger.debug(f"Sanitized URL from {original_url} to {url}")
|
logger.debug(f"Sanitized URL to {url}")
|
||||||
result.set("original_url", original_url)
|
result.set("original_url", original_url)
|
||||||
|
|
||||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||||
@@ -614,25 +620,25 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
try:
|
try:
|
||||||
d.done(cached_result, cached=True)
|
d.done(cached_result, cached=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
return cached_result
|
return cached_result
|
||||||
|
|
||||||
# 3 - call extractors until one succeeds
|
# 3 - call extractors until one succeeds
|
||||||
for a in self.extractors:
|
for a in self.extractors:
|
||||||
logger.info(f"Trying extractor {a.name} for {url}")
|
logger.info(f"trying extractor {a.name}")
|
||||||
try:
|
try:
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success():
|
if result.is_success():
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 4 - call enrichers to work with archived content
|
# 4 - call enrichers to work with archived content
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try:
|
try:
|
||||||
e.enrich(result)
|
e.enrich(result)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
logger.error(f"enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 5 - store all downloaded/generated media
|
# 5 - store all downloaded/generated media
|
||||||
result.store(storages=self.storages)
|
result.store(storages=self.storages)
|
||||||
@@ -651,7 +657,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||||||
try:
|
try:
|
||||||
d.done(result)
|
d.done(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"database {d.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from abc import abstractmethod
|
|||||||
from typing import IO
|
from typing import IO
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
from auto_archiver.utils.misc import random_str
|
from auto_archiver.utils.misc import random_str
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from urllib.parse import urljoin
|
|||||||
import glob
|
import glob
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import selenium
|
import selenium
|
||||||
from seleniumbase import SB
|
from seleniumbase import SB
|
||||||
|
|
||||||
@@ -57,7 +57,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
continue # Skip imported modules/classes/functions
|
continue # Skip imported modules/classes/functions
|
||||||
if isinstance(obj, type) and issubclass(obj, Dropin):
|
if isinstance(obj, type) and issubclass(obj, Dropin):
|
||||||
dropins.append(obj)
|
dropins.append(obj)
|
||||||
logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
logger.debug(f"loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
|
||||||
return dropins
|
return dropins
|
||||||
|
|
||||||
def sanitize_url(self, url: str) -> str:
|
def sanitize_url(self, url: str) -> str:
|
||||||
@@ -83,14 +83,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
||||||
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
url_sample = url[:75]
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
||||||
logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
|
logger.info(f"selenium browser is up with agent {self.agent}, opening url...")
|
||||||
sb.uc_open_with_reconnect(url, 4)
|
sb.uc_open_with_reconnect(url, 4)
|
||||||
|
|
||||||
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
|
logger.debug("handling CAPTCHAs for...")
|
||||||
sb.uc_gui_handle_cf()
|
sb.uc_gui_handle_cf()
|
||||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||||
|
|
||||||
@@ -98,7 +97,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
dropin.open_page(url)
|
dropin.open_page(url)
|
||||||
|
|
||||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||||
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
|
logger.warning("skipping since auth wall or CAPTCHA was detected")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
sb.wait_for_ready_state_complete()
|
sb.wait_for_ready_state_complete()
|
||||||
@@ -125,18 +124,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
js_css_selector=dropin.js_for_video_css_selectors(),
|
js_css_selector=dropin.js_for_video_css_selectors(),
|
||||||
max_media=self.max_download_videos - downloaded_videos,
|
max_media=self.max_download_videos - downloaded_videos,
|
||||||
)
|
)
|
||||||
logger.info(f"ANTIBOT completed for {url_sample}")
|
logger.info("completed")
|
||||||
|
|
||||||
return to_enrich
|
return to_enrich
|
||||||
except selenium.common.exceptions.SessionNotCreatedException as e:
|
except selenium.common.exceptions.SessionNotCreatedException as e:
|
||||||
if custom_data_dir: # the retry logic only works once
|
if custom_data_dir: # the retry logic only works once
|
||||||
logger.error(
|
logger.error(
|
||||||
f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
f"session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
|
||||||
)
|
)
|
||||||
return self.enrich(to_enrich, custom_data_dir=False)
|
return self.enrich(to_enrich, custom_data_dir=False)
|
||||||
raise e # re-raise
|
raise e # re-raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
|
logger.error(f"runtime error: {e}: {traceback.format_exc()}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _get_suitable_dropin(self, url: str, sb: SB):
|
def _get_suitable_dropin(self, url: str, sb: SB):
|
||||||
@@ -146,7 +145,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
"""
|
"""
|
||||||
for dropin in self.dropins:
|
for dropin in self.dropins:
|
||||||
if dropin.suitable(url):
|
if dropin.suitable(url):
|
||||||
logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
|
logger.debug(f"using drop-in {dropin.__name__}")
|
||||||
return dropin(sb, self)
|
return dropin(sb, self)
|
||||||
|
|
||||||
return DefaultDropin(sb, self)
|
return DefaultDropin(sb, self)
|
||||||
@@ -241,7 +240,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
|
|
||||||
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
|
||||||
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
|
||||||
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
|
logger.debug(f"setting window size to {x}x{y} for full page screenshot.")
|
||||||
sb.set_window_size(x, y)
|
sb.set_window_size(x, y)
|
||||||
|
|
||||||
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
|
||||||
@@ -280,7 +279,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
# js_for_css_selectors
|
# js_for_css_selectors
|
||||||
for src in sources:
|
for src in sources:
|
||||||
if len(all_urls) >= max_media:
|
if len(all_urls) >= max_media:
|
||||||
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
logger.debug(f"reached max download limit of {max_media} images/videos.")
|
||||||
break
|
break
|
||||||
if not is_relevant_url(src):
|
if not is_relevant_url(src):
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -0,0 +1,60 @@
|
|||||||
|
# def solve_captcha(image_url):
|
||||||
|
# # Download image
|
||||||
|
# img_data = requests.get(image_url).content
|
||||||
|
# encoded_image = base64.b64encode(img_data).decode()
|
||||||
|
|
||||||
|
# # Submit to AntiCaptcha
|
||||||
|
# task = {
|
||||||
|
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||||
|
# "task": {
|
||||||
|
# "type": "ImageToTextTask",
|
||||||
|
# "body": encoded_image
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# print("[*] Sending captcha request to anti-captcha...")
|
||||||
|
|
||||||
|
# task_response = requests.post("https://api.anti-captcha.com/createTask", json=task).json()
|
||||||
|
# task_id = task_response["taskId"]
|
||||||
|
# print(f"[*] Anti-captcha response: {task_response}")
|
||||||
|
|
||||||
|
# # Poll for result
|
||||||
|
# while True:
|
||||||
|
# time.sleep(5)
|
||||||
|
# res = requests.post("https://api.anti-captcha.com/getTaskResult", json={
|
||||||
|
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||||
|
# "taskId": task_id
|
||||||
|
# }).json()
|
||||||
|
# if res["status"] == "ready":
|
||||||
|
# print(f"[*] Captcha solved: {res}")
|
||||||
|
# return res["solution"]["text"]
|
||||||
|
# print(f"[*] Polling for captcha solution: {res['status']}")
|
||||||
|
|
||||||
|
|
||||||
|
# def solve_recaptcha(site_key, page_url):
|
||||||
|
# print("[*] Sending captcha request to anti-captcha...")
|
||||||
|
# # Step 1: Send captcha request
|
||||||
|
# task_payload = {
|
||||||
|
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||||
|
# "task": {
|
||||||
|
# "type": "NoCaptchaTaskProxyless",
|
||||||
|
# "websiteURL": page_url,
|
||||||
|
# "websiteKey": site_key
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# response = requests.post("https://api.anti-captcha.com/createTask", json=task_payload).json()
|
||||||
|
# print(f"[*] Anti-captcha response: {response}")
|
||||||
|
# task_id = response["taskId"]
|
||||||
|
|
||||||
|
# # Step 2: Poll for solution
|
||||||
|
# print("[*] Polling for captcha solution...")
|
||||||
|
# for i in range(40): # ~80 seconds
|
||||||
|
# time.sleep(2)
|
||||||
|
# result = requests.post("https://api.anti-captcha.com/getTaskResult", json={
|
||||||
|
# "clientKey": ANTI_CAPTCHA_KEY,
|
||||||
|
# "taskId": task_id
|
||||||
|
# }).json()
|
||||||
|
# print(f" Poll {i+1}: status={result['status']}")
|
||||||
|
# if result["status"] == "ready":
|
||||||
|
# print("[*] Captcha solved!")
|
||||||
|
# return result["solution"]["gRecaptchaResponse"]
|
||||||
|
# raise TimeoutError("AntiCaptcha took too long")
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
|
import traceback
|
||||||
from typing import Mapping
|
from typing import Mapping
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from seleniumbase import SB
|
from seleniumbase import SB
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
|
|
||||||
@@ -143,7 +144,7 @@ class Dropin:
|
|||||||
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
with yt_dlp.YoutubeDL(validated_options) as ydl:
|
||||||
for url in video_urls:
|
for url in video_urls:
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Downloading video from URL: {url}")
|
logger.debug("downloading video from url")
|
||||||
info = ydl.extract_info(url, download=True)
|
info = ydl.extract_info(url, download=True)
|
||||||
filename = ydl_entry_to_filename(ydl, info)
|
filename = ydl_entry_to_filename(ydl, info)
|
||||||
if not filename: # Failed to download video.
|
if not filename: # Failed to download video.
|
||||||
@@ -155,5 +156,5 @@ class Dropin:
|
|||||||
to_enrich.add_media(media)
|
to_enrich.add_media(media)
|
||||||
downloaded += 1
|
downloaded += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error downloading {url}: {e}")
|
logger.error(f"download failed: {e} {traceback.format_exc()}")
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from typing import Mapping
|
from typing import Mapping
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||||
|
|
||||||
|
|
||||||
@@ -62,7 +62,7 @@ class LinkedinDropin(Dropin):
|
|||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
|
|
||||||
username, password = self._get_username_password("linkedin.com")
|
username, password = self._get_username_password("linkedin.com")
|
||||||
logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
|
logger.debug("logging in to Linkedin with username: {}", username)
|
||||||
self.sb.type("#username", username)
|
self.sb.type("#username", username)
|
||||||
self.sb.type("#password", password)
|
self.sb.type("#password", password)
|
||||||
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
|
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from typing import Mapping
|
|||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
class RedditDropin(Dropin):
|
class RedditDropin(Dropin):
|
||||||
@@ -50,7 +50,7 @@ class RedditDropin(Dropin):
|
|||||||
self._close_cookies_banner()
|
self._close_cookies_banner()
|
||||||
|
|
||||||
username, password = self._get_username_password("reddit.com")
|
username, password = self._get_username_password("reddit.com")
|
||||||
logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
|
logger.debug("logging in to Reddit with username: {}", username)
|
||||||
|
|
||||||
self.sb.type("#login-username", username)
|
self.sb.type("#login-username", username)
|
||||||
self.sb.type("#login-password", password)
|
self.sb.type("#login-password", password)
|
||||||
@@ -68,7 +68,7 @@ class RedditDropin(Dropin):
|
|||||||
self.sb.click_link_text("Log in")
|
self.sb.click_link_text("Log in")
|
||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
if self.sb.is_text_visible("Welcome back"):
|
if self.sb.is_text_visible("Welcome back"):
|
||||||
logger.debug("RedditDropin Login successful")
|
logger.debug("login successful")
|
||||||
self.sb.click_if_visible("this link")
|
self.sb.click_if_visible("this link")
|
||||||
|
|
||||||
def _close_cookies_banner(self):
|
def _close_cookies_banner(self):
|
||||||
@@ -88,5 +88,5 @@ class RedditDropin(Dropin):
|
|||||||
.map(el => el.src || el.href)
|
.map(el => el.src || el.href)
|
||||||
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
.filter(url => url && /\.(m3u8|mpd|ism)$/.test(url));
|
||||||
""")
|
""")
|
||||||
logger.debug("RedditDropin Found {} video URLs", len(filtered_urls))
|
logger.debug("found {} video URLs", len(filtered_urls))
|
||||||
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
return 0, self._download_videos_with_ytdlp(filtered_urls, to_enrich)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from typing import Mapping
|
|||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
class VkDropin(Dropin):
|
class VkDropin(Dropin):
|
||||||
@@ -57,12 +57,12 @@ class VkDropin(Dropin):
|
|||||||
self.sb.open("https://vk.com")
|
self.sb.open("https://vk.com")
|
||||||
self.sb.wait_for_ready_state_complete()
|
self.sb.wait_for_ready_state_complete()
|
||||||
if "/feed" in self.sb.get_current_url():
|
if "/feed" in self.sb.get_current_url():
|
||||||
logger.debug("Already logged in to VK.")
|
logger.debug("already logged in to VK.")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# need to login
|
# need to login
|
||||||
username, password = self._get_username_password("vk.com")
|
username, password = self._get_username_password("vk.com")
|
||||||
logger.debug("Logging in to VK with username: {}", username)
|
logger.debug("logging in to VK with username: {}", username)
|
||||||
|
|
||||||
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
|
||||||
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from typing import Union
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Database
|
from auto_archiver.core import Database
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
@@ -36,9 +36,9 @@ class AAApiDb(Database):
|
|||||||
if not self.store_results:
|
if not self.store_results:
|
||||||
return
|
return
|
||||||
if cached:
|
if cached:
|
||||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
logger.debug("skipping saving archive to AA API because it was cached")
|
||||||
return
|
return
|
||||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
logger.debug("saving archive to the AA API.")
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"author_id": self.author_id,
|
"author_id": self.author_id,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import os
|
|||||||
from typing import IO, Iterator, Optional, Union
|
from typing import IO, Iterator, Optional, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
|
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
|
||||||
from auto_archiver.utils import calculate_file_hash
|
from auto_archiver.utils import calculate_file_hash
|
||||||
@@ -66,13 +66,13 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
"""Mark an item as failed in Atlos, if the ID exists."""
|
"""Mark an item as failed in Atlos, if the ID exists."""
|
||||||
atlos_id = item.metadata.get("atlos_id")
|
atlos_id = item.metadata.get("atlos_id")
|
||||||
if not atlos_id:
|
if not atlos_id:
|
||||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
logger.info("No Atlos ID available, skipping")
|
||||||
return
|
return
|
||||||
self._post(
|
self._post(
|
||||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||||
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
||||||
)
|
)
|
||||||
logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")
|
logger.info(f"stored failure ID {atlos_id} on Atlos: {reason}")
|
||||||
|
|
||||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||||
"""check and fetch if the given item has been archived already, each
|
"""check and fetch if the given item has been archived already, each
|
||||||
@@ -88,7 +88,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
"""Mark an item as successfully archived in Atlos."""
|
"""Mark an item as successfully archived in Atlos."""
|
||||||
atlos_id = item.metadata.get("atlos_id")
|
atlos_id = item.metadata.get("atlos_id")
|
||||||
if not atlos_id:
|
if not atlos_id:
|
||||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
logger.info("item has no Atlos ID, skipping")
|
||||||
return
|
return
|
||||||
self._post(
|
self._post(
|
||||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||||
@@ -100,7 +100,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")
|
logger.info(f"stored success ID {atlos_id} on Atlos")
|
||||||
|
|
||||||
# ! Atlos Module - Storage Methods
|
# ! Atlos Module - Storage Methods
|
||||||
|
|
||||||
@@ -111,12 +111,12 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
||||||
"""Upload a media file to Atlos if it has not been uploaded already."""
|
"""Upload a media file to Atlos if it has not been uploaded already."""
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
logger.error(f"No metadata provided for {media.filename}")
|
logger.error(f"no metadata provided for {media.filename}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
atlos_id = metadata.get("atlos_id")
|
atlos_id = metadata.get("atlos_id")
|
||||||
if not atlos_id:
|
if not atlos_id:
|
||||||
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
logger.error(f"no Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
||||||
@@ -135,7 +135,7 @@ class AtlosFeederDbStorage(Feeder, Database, Storage):
|
|||||||
params={"title": media.properties},
|
params={"title": media.properties},
|
||||||
files={"file": (os.path.basename(media.filename), file_obj)},
|
files={"file": (os.path.basename(media.filename), file_obj)},
|
||||||
)
|
)
|
||||||
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
logger.info(f"uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
from loguru import logger
|
|
||||||
|
|
||||||
from auto_archiver.core.feeder import Feeder
|
from auto_archiver.core.feeder import Feeder
|
||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
from auto_archiver.core.consts import SetupError
|
from auto_archiver.core.consts import SetupError
|
||||||
@@ -16,8 +14,5 @@ class CLIFeeder(Feeder):
|
|||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
urls = self.config["urls"]
|
urls = self.config["urls"]
|
||||||
for url in urls:
|
for url in urls:
|
||||||
logger.debug(f"Processing {url}")
|
|
||||||
m = Metadata().set_url(url)
|
m = Metadata().set_url(url)
|
||||||
yield m
|
yield m
|
||||||
|
|
||||||
logger.success(f"Processed {len(urls)} URL(s)")
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Database
|
from auto_archiver.core import Database
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from csv import DictWriter
|
from csv import DictWriter
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
from auto_archiver.core import Feeder
|
from auto_archiver.core import Feeder
|
||||||
@@ -20,20 +20,19 @@ class CSVFeeder(Feeder):
|
|||||||
url_column = first_row.index(url_column)
|
url_column = first_row.index(url_column)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
f"column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?"
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
elif not (url_or_none(first_row[url_column])):
|
elif not (url_or_none(first_row[url_column])):
|
||||||
# it's a header row, but we've been given a column number already
|
# it's a header row, but we've been given a column number already
|
||||||
logger.debug(f"Skipping header row: {first_row}")
|
logger.debug(f"skipping header row: {first_row}")
|
||||||
else:
|
else:
|
||||||
# first row isn't a header row, rewind the file
|
# first row isn't a header row, rewind the file
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
|
|
||||||
for row in reader:
|
for row in reader:
|
||||||
if not url_or_none(row[url_column]):
|
if not url_or_none(row[url_column]):
|
||||||
logger.warning(f"Not a valid URL in row: {row}, skipping")
|
logger.warning(f"not a valid URL in row: {row}, skipping")
|
||||||
continue
|
continue
|
||||||
url = row[url_column]
|
url = row[url_column]
|
||||||
logger.debug(f"Processing {url}")
|
|
||||||
yield Metadata().set_url(url)
|
yield Metadata().set_url(url)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from google.oauth2 import service_account
|
|||||||
from google.oauth2.credentials import Credentials
|
from google.oauth2.credentials import Credentials
|
||||||
from googleapiclient.discovery import build
|
from googleapiclient.discovery import build
|
||||||
from googleapiclient.http import MediaFileUpload
|
from googleapiclient.http import MediaFileUpload
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Media
|
from auto_archiver.core import Media
|
||||||
from auto_archiver.core import Storage
|
from auto_archiver.core import Storage
|
||||||
@@ -23,10 +23,10 @@ class GDriveStorage(Storage):
|
|||||||
def _setup_google_drive_service(self):
|
def _setup_google_drive_service(self):
|
||||||
"""Initialize Google Drive service based on provided credentials."""
|
"""Initialize Google Drive service based on provided credentials."""
|
||||||
if self.oauth_token:
|
if self.oauth_token:
|
||||||
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
|
logger.debug(f"using Google Drive OAuth token: {self.oauth_token}")
|
||||||
self.service = self._initialize_with_oauth_token()
|
self.service = self._initialize_with_oauth_token()
|
||||||
elif self.service_account:
|
elif self.service_account:
|
||||||
logger.debug(f"Using Google Drive service account: {self.service_account}")
|
logger.debug(f"using Google Drive service account: {self.service_account}")
|
||||||
self.service = self._initialize_with_service_account()
|
self.service = self._initialize_with_service_account()
|
||||||
else:
|
else:
|
||||||
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
||||||
@@ -41,7 +41,7 @@ class GDriveStorage(Storage):
|
|||||||
if not creds.valid and creds.expired and creds.refresh_token:
|
if not creds.valid and creds.expired and creds.refresh_token:
|
||||||
creds.refresh(Request())
|
creds.refresh(Request())
|
||||||
with open(self.oauth_token, "w") as token_file:
|
with open(self.oauth_token, "w") as token_file:
|
||||||
logger.debug("Saving refreshed OAuth token.")
|
logger.debug("saving refreshed OAuth token.")
|
||||||
token_file.write(creds.to_json())
|
token_file.write(creds.to_json())
|
||||||
elif not creds.valid:
|
elif not creds.valid:
|
||||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||||
@@ -180,7 +180,7 @@ class GDriveStorage(Storage):
|
|||||||
Creates a new GDrive folder @name inside folder @parent_id
|
Creates a new GDrive folder @name inside folder @parent_id
|
||||||
Returns id of the created folder
|
Returns id of the created folder
|
||||||
"""
|
"""
|
||||||
logger.debug(f"Creating new folder with {name=} inside {parent_id=}")
|
logger.debug(f"creating new folder with {name=} inside {parent_id=}")
|
||||||
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
file_metadata = {"name": [name], "mimeType": "application/vnd.google-apps.folder", "parents": [parent_id]}
|
||||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields="id").execute()
|
||||||
return gd_folder.get("id")
|
return gd_folder.get("id")
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
@@ -18,7 +18,7 @@ class Bluesky(GenericDropin):
|
|||||||
# download if embeds present (1 video XOR >=1 images)
|
# download if embeds present (1 video XOR >=1 images)
|
||||||
for media in self._download_bsky_embeds(post, archiver):
|
for media in self._download_bsky_embeds(post, archiver):
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
logger.debug(f"Downloaded {len(result.media)} media files")
|
logger.debug(f"downloaded {len(result.media)} media files")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from yt_dlp.extractor.common import InfoExtractor
|
|||||||
from yt_dlp.utils import MaxDownloadsReached
|
from yt_dlp.utils import MaxDownloadsReached
|
||||||
import pysubs2
|
import pysubs2
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
@@ -63,12 +63,11 @@ class GenericExtractor(Extractor):
|
|||||||
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
|
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
|
||||||
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
|
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
|
||||||
else:
|
else:
|
||||||
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
|
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver\n ======= RESTARTING ======= ")
|
||||||
logger.warning(" ======= RESTARTING ======= ")
|
|
||||||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||||||
|
|
||||||
def update_package(self, package_name: str) -> bool:
|
def update_package(self, package_name: str) -> bool:
|
||||||
logger.info(f"Checking and updating {package_name}...")
|
logger.info(f"checking and updating {package_name}...")
|
||||||
from importlib.metadata import version as get_version
|
from importlib.metadata import version as get_version
|
||||||
|
|
||||||
old_version = get_version(package_name)
|
old_version = get_version(package_name)
|
||||||
@@ -80,7 +79,7 @@ class GenericExtractor(Extractor):
|
|||||||
return True
|
return True
|
||||||
logger.info(f"{package_name} already up to date")
|
logger.info(f"{package_name} already up to date")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error updating {package_name}: {e}")
|
logger.error(f"failed to update {package_name}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def setup_po_tokens(self) -> None:
|
def setup_po_tokens(self) -> None:
|
||||||
@@ -111,7 +110,7 @@ class GenericExtractor(Extractor):
|
|||||||
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
||||||
if missing_tools:
|
if missing_tools:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
f"cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
||||||
"Install these tools or run bgutils via Docker. "
|
"Install these tools or run bgutils via Docker. "
|
||||||
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
||||||
)
|
)
|
||||||
@@ -140,7 +139,7 @@ class GenericExtractor(Extractor):
|
|||||||
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
||||||
)
|
)
|
||||||
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
||||||
logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
|
logger.info(f"downloading bgutils release zip for version {plugin_version}...")
|
||||||
urlretrieve(zip_url, zip_path)
|
urlretrieve(zip_url, zip_path)
|
||||||
with zipfile.ZipFile(zip_path, "r") as z:
|
with zipfile.ZipFile(zip_path, "r") as z:
|
||||||
z.extractall(base_dir)
|
z.extractall(base_dir)
|
||||||
@@ -149,7 +148,7 @@ class GenericExtractor(Extractor):
|
|||||||
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
||||||
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
||||||
shutil.rmtree(extracted_root)
|
shutil.rmtree(extracted_root)
|
||||||
logger.info("Installing dependencies and transpiling PoT Generator script...")
|
logger.info("installing dependencies and transpiling PoT Generator script...")
|
||||||
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
||||||
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
||||||
|
|
||||||
@@ -165,7 +164,7 @@ class GenericExtractor(Extractor):
|
|||||||
logger.info(f"PO Token script configured at: {script_path}")
|
logger.info(f"PO Token script configured at: {script_path}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to set up PO Token script: {e}")
|
logger.error(f"failed to set up PO Token script: {e}")
|
||||||
|
|
||||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||||
"""
|
"""
|
||||||
@@ -206,7 +205,7 @@ class GenericExtractor(Extractor):
|
|||||||
media = Media(cover_image_path)
|
media = Media(cover_image_path)
|
||||||
metadata.add_media(media, id="cover")
|
metadata.add_media(media, id="cover")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
|
logger.error(f"could not download cover image {thumbnail_url}: {e}")
|
||||||
|
|
||||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||||
if dropin:
|
if dropin:
|
||||||
@@ -353,7 +352,7 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
if not dropin:
|
if not dropin:
|
||||||
# TODO: add a proper link to 'how to create your own dropin'
|
# TODO: add a proper link to 'how to create your own dropin'
|
||||||
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
logger.debug(f"""could not find valid dropin for {info_extractor.ie_key()}.
|
||||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -389,7 +388,7 @@ class GenericExtractor(Extractor):
|
|||||||
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
|
logger.debug(f"using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||||
|
|
||||||
new_media = Media(filename)
|
new_media = Media(filename)
|
||||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||||
@@ -404,12 +403,12 @@ class GenericExtractor(Extractor):
|
|||||||
text = " ".join([line.text for line in subs])
|
text = " ".join([line.text for line in subs])
|
||||||
new_media.set(f"subtitles_{lang}", text)
|
new_media.set(f"subtitles_{lang}", text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
logger.error(f"error loading subtitle file {val.get('filepath')}: {e}")
|
||||||
result.add_media(new_media)
|
result.add_media(new_media)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing entry {entry}: {e}")
|
logger.error(f"error processing entry {entry}: {e}")
|
||||||
if not len(result.media):
|
if not len(result.media):
|
||||||
logger.info(f"No media found for entry {entry}, skipping.")
|
logger.info(f"no media found for entry {entry}, skipping.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
@@ -471,14 +470,14 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
||||||
if data.get("is_live", False) and not self.livestreams:
|
if data.get("is_live", False) and not self.livestreams:
|
||||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
logger.warning("livestream detected, skipping due to 'livestreams' configuration setting")
|
||||||
return False
|
return False
|
||||||
# it's a valid video, that the youtubdedl can download out of the box
|
# it's a valid video, that the youtubdedl can download out of the box
|
||||||
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
logger.debug(f"skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||||
raise SkipYtdlp()
|
raise SkipYtdlp()
|
||||||
|
|
||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
@@ -497,17 +496,17 @@ class GenericExtractor(Extractor):
|
|||||||
|
|
||||||
if not isinstance(e, SkipYtdlp):
|
if not isinstance(e, SkipYtdlp):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
f'issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||||
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
logger.error("error downloading metadata for post: {error}", error=str(post_e))
|
||||||
return False
|
return False
|
||||||
except Exception as generic_e:
|
except Exception as generic_e:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
'attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||||
name=info_extractor.IE_NAME,
|
name=info_extractor.IE_NAME,
|
||||||
error=str(generic_e),
|
error=str(generic_e),
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
@@ -560,17 +559,17 @@ class GenericExtractor(Extractor):
|
|||||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||||
if auth:
|
if auth:
|
||||||
if "username" in auth and "password" in auth:
|
if "username" in auth and "password" in auth:
|
||||||
logger.debug(f"Using provided auth username and password for {url}")
|
logger.debug("using provided auth username and password")
|
||||||
ydl_options.extend(("--username", auth["username"]))
|
ydl_options.extend(("--username", auth["username"]))
|
||||||
ydl_options.extend(("--password", auth["password"]))
|
ydl_options.extend(("--password", auth["password"]))
|
||||||
elif "cookie" in auth:
|
elif "cookie" in auth:
|
||||||
logger.debug(f"Using provided auth cookie for {url}")
|
logger.debug("using provided auth cookie")
|
||||||
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||||
elif "cookies_from_browser" in auth:
|
elif "cookies_from_browser" in auth:
|
||||||
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
|
logger.debug(f"using extracted cookies from browser {auth['cookies_from_browser']}")
|
||||||
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
|
||||||
elif "cookies_file" in auth:
|
elif "cookies_file" in auth:
|
||||||
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
|
logger.debug(f"using cookies from file {auth['cookies_file']}")
|
||||||
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
ydl_options.extend(("--cookies", auth["cookies_file"]))
|
||||||
|
|
||||||
# Applying user-defined extractor_args
|
# Applying user-defined extractor_args
|
||||||
@@ -580,11 +579,11 @@ class GenericExtractor(Extractor):
|
|||||||
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
arg_str = ";".join(f"{k}={v}" for k, v in args.items())
|
||||||
else:
|
else:
|
||||||
arg_str = str(args)
|
arg_str = str(args)
|
||||||
logger.debug(f"Setting extractor_args: {key}:{arg_str}")
|
logger.debug(f"setting extractor_args: {key}:{arg_str}")
|
||||||
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
ydl_options.extend(["--extractor-args", f"{key}:{arg_str}"])
|
||||||
|
|
||||||
if self.ytdlp_args:
|
if self.ytdlp_args:
|
||||||
logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}")
|
logger.debug(f"adding additional ytdlp arguments: {self.ytdlp_args}")
|
||||||
ydl_options += self.ytdlp_args.split(" ")
|
ydl_options += self.ytdlp_args.split(" ")
|
||||||
|
|
||||||
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
*_, validated_options = yt_dlp.parse_options(ydl_options)
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
|
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ class Tiktok(GenericDropin):
|
|||||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
logger.debug(f"using Tikwm API to attempt to download tiktok video from {url=}")
|
||||||
|
|
||||||
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
endpoint = self.TIKWM_ENDPOINT.format(url=url)
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
@@ -40,7 +40,7 @@ class Twitter(GenericDropin):
|
|||||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
full_text = tweet.pop("full_text", "")
|
full_text = tweet.pop("full_text", "")
|
||||||
@@ -49,7 +49,7 @@ class Twitter(GenericDropin):
|
|||||||
|
|
||||||
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
||||||
if not tweet.get("entities", {}).get("media"):
|
if not tweet.get("entities", {}).get("media"):
|
||||||
logger.debug("No media found, archiving tweet text only")
|
logger.debug("no media found, archiving tweet text only")
|
||||||
result.status = "twitter-ytdl"
|
result.status = "twitter-ytdl"
|
||||||
return result
|
return result
|
||||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||||
|
|||||||
@@ -10,11 +10,12 @@ The filtered rows are processed into `Metadata` objects.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import traceback
|
||||||
from typing import Tuple, Union, Iterator
|
from typing import Tuple, Union, Iterator
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
import gspread
|
import gspread
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
|
|
||||||
@@ -41,19 +42,19 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
sh = self.open_sheet()
|
sh = self.open_sheet()
|
||||||
for ii, worksheet in enumerate(sh.worksheets()):
|
for ii, worksheet in enumerate(sh.worksheets()):
|
||||||
if not self.should_process_sheet(worksheet.title):
|
if not self.should_process_sheet(worksheet.title):
|
||||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
logger.debug(f"skipped worksheet '{worksheet.title}' due to allow/block rules")
|
||||||
continue
|
continue
|
||||||
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
logger.info(f"opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||||
if len(missing_cols := self.missing_required_columns(gw)):
|
if len(missing_cols := self.missing_required_columns(gw)):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
f"skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
|
||||||
# process and yield metadata here:
|
# process and yield metadata here:
|
||||||
yield from self._process_rows(gw)
|
yield from self._process_rows(gw)
|
||||||
logger.info(f"Finished worksheet {worksheet.title}")
|
logger.info(f"finished worksheet {worksheet.title}")
|
||||||
|
|
||||||
def _process_rows(self, gw: GWorksheet):
|
def _process_rows(self, gw: GWorksheet):
|
||||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||||
@@ -69,6 +70,8 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
# All checks done - archival process starts here
|
# All checks done - archival process starts here
|
||||||
m = Metadata().set_url(url)
|
m = Metadata().set_url(url)
|
||||||
self._set_context(m, gw, row)
|
self._set_context(m, gw, row)
|
||||||
|
|
||||||
|
with logger.contextualize(row=row):
|
||||||
yield m
|
yield m
|
||||||
|
|
||||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||||
@@ -99,16 +102,16 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
return missing
|
return missing
|
||||||
|
|
||||||
def started(self, item: Metadata) -> None:
|
def started(self, item: Metadata) -> None:
|
||||||
logger.info(f"STARTED {item}")
|
logger.info("STARTED")
|
||||||
gw, row = self._retrieve_gsheet(item)
|
gw, row = self._retrieve_gsheet(item)
|
||||||
gw.set_cell(row, "status", "Archive in progress")
|
gw.set_cell(row, "status", "Archive in progress")
|
||||||
|
|
||||||
def failed(self, item: Metadata, reason: str) -> None:
|
def failed(self, item: Metadata, reason: str) -> None:
|
||||||
logger.error(f"FAILED {item}")
|
logger.error("FAILED")
|
||||||
self._safe_status_update(item, f"Archive failed {reason}")
|
self._safe_status_update(item, f"Archive failed {reason}")
|
||||||
|
|
||||||
def aborted(self, item: Metadata) -> None:
|
def aborted(self, item: Metadata) -> None:
|
||||||
logger.warning(f"ABORTED {item}")
|
logger.warning("ABORTED")
|
||||||
self._safe_status_update(item, "")
|
self._safe_status_update(item, "")
|
||||||
|
|
||||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||||
@@ -122,9 +125,7 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
cell_updates = []
|
cell_updates = []
|
||||||
row_values = gw.get_row(row)
|
row_values = gw.get_row(row)
|
||||||
|
|
||||||
spreadsheet = gw.wks.spreadsheet.title
|
logger.info("DONE")
|
||||||
worksheet = gw.wks.title
|
|
||||||
logger.info(f"DONE url='{item.get_url()}' {row=} on {spreadsheet=} : {worksheet=}")
|
|
||||||
|
|
||||||
def batch_if_valid(col, val, final_value=None):
|
def batch_if_valid(col, val, final_value=None):
|
||||||
final_value = final_value or val
|
final_value = final_value or val
|
||||||
@@ -132,7 +133,7 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||||
cell_updates.append((row, col, final_value))
|
cell_updates.append((row, col, final_value))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
logger.error(f"unable to batch {col}={final_value} due to {e}")
|
||||||
|
|
||||||
status_message = item.status
|
status_message = item.status
|
||||||
if cached:
|
if cached:
|
||||||
@@ -192,15 +193,13 @@ class GsheetsFeederDB(Feeder, Database):
|
|||||||
gw, row = self._retrieve_gsheet(item)
|
gw, row = self._retrieve_gsheet(item)
|
||||||
gw.set_cell(row, "status", new_status)
|
gw.set_cell(row, "status", new_status)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Unable to update sheet: {e}")
|
logger.debug(f"unable to update sheet: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||||
if gsheet := item.get_context("gsheet"):
|
if gsheet := item.get_context("gsheet"):
|
||||||
gw: GWorksheet = gsheet.get("worksheet")
|
gw: GWorksheet = gsheet.get("worksheet")
|
||||||
row: int = gsheet.get("row")
|
row: int = gsheet.get("row")
|
||||||
elif self.sheet_id:
|
elif self.sheet_id:
|
||||||
logger.error(
|
logger.error("unable to retrieve Gsheet, GsheetDB must be used alongside GsheetFeeder.")
|
||||||
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
|
|
||||||
)
|
|
||||||
|
|
||||||
return gw, row
|
return gw, row
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ making it suitable for handling large files efficiently.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
@@ -22,8 +22,7 @@ class HashEnricher(Enricher):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
url = to_enrich.get_url()
|
logger.debug(f"calculating media hashes with algo={self.algorithm}")
|
||||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if len(hd := self.calculate_hash(m.filename)):
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import os
|
|||||||
import pathlib
|
import pathlib
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import json
|
import json
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ class HtmlFormatter(Formatter):
|
|||||||
def format(self, item: Metadata) -> Media:
|
def format(self, item: Metadata) -> Media:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if item.is_empty():
|
if item.is_empty():
|
||||||
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
|
logger.debug("nothing to format, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
content = self.template.render(
|
content = self.template.render(
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from datetime import datetime
|
|||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
@@ -45,11 +45,11 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
|
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
|
||||||
insta_matches = self.valid_url.findall(url)
|
insta_matches = self.valid_url.findall(url)
|
||||||
logger.info(f"{insta_matches=}")
|
|
||||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||||
return
|
return
|
||||||
if len(insta_matches) > 1:
|
if len(insta_matches) > 1:
|
||||||
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
|
logger.debug("multiple instagram matches found, using the first one")
|
||||||
return
|
return
|
||||||
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
|
||||||
if g1 == "":
|
if g1 == "":
|
||||||
@@ -65,7 +65,7 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
return self.download_post(item, id=g3, context="story")
|
return self.download_post(item, id=g3, context="story")
|
||||||
return self.download_stories(item, g2)
|
return self.download_stories(item, g2)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
|
logger.warning(f"unknown instagram regex group match {g1=}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
|
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
|
||||||
@@ -112,8 +112,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
count_posts += len(stories)
|
count_posts += len(stories)
|
||||||
result.set("#stories", len(stories))
|
result.set("#stories", len(stories))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading stories for {username}")
|
result.append("errors", f"error downloading stories for {username}")
|
||||||
logger.error(f"Error downloading stories for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading stories for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all posts
|
# download all posts
|
||||||
try:
|
try:
|
||||||
@@ -122,8 +122,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading posts for {username}")
|
result.append("errors", f"error downloading posts for {username}")
|
||||||
logger.error(f"Error downloading posts for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading posts for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all tagged
|
# download all tagged
|
||||||
try:
|
try:
|
||||||
@@ -132,8 +132,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
result, user_id, max_to_download=self.full_profile_max_posts - count_posts
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading tagged posts for {username}")
|
result.append("errors", f"error downloading tagged posts for {username}")
|
||||||
logger.error(f"Error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading tagged posts for {username}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
# download all highlights
|
# download all highlights
|
||||||
try:
|
try:
|
||||||
@@ -159,10 +159,10 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append(
|
result.append(
|
||||||
"errors",
|
"errors",
|
||||||
f"Error downloading highlight id{h.get('pk')} for {username}",
|
f"error downloading highlight id{h.get('pk')} for {username}",
|
||||||
)
|
)
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
f"error downloading highlight id{h.get('pk')} for {username}: {e} {traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
if count_highlights >= max_to_download:
|
if count_highlights >= max_to_download:
|
||||||
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
|
logger.debug(f"HIGHLIGHTS reached max_to_download={self.full_profile_max_posts}")
|
||||||
@@ -208,8 +208,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.scrape_item(result, h, "highlight")
|
self.scrape_item(result, h, "highlight")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading highlight {h.get('id')}")
|
result.append("errors", f"error downloading highlight {h.get('id')}")
|
||||||
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading highlight, skipping {h.get('id')}: {e} {traceback.format_exc()}")
|
||||||
|
|
||||||
return h_info
|
return h_info
|
||||||
|
|
||||||
@@ -251,8 +251,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.scrape_item(result, p, "post")
|
self.scrape_item(result, p, "post")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading post {p.get('id')}")
|
result.append("errors", f"error downloading post {p.get('id')}")
|
||||||
logger.error(f"Error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
post_count += 1
|
post_count += 1
|
||||||
if post_count >= max_to_download:
|
if post_count >= max_to_download:
|
||||||
@@ -279,8 +279,8 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.scrape_item(result, p, "tagged")
|
self.scrape_item(result, p, "tagged")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result.append("errors", f"Error downloading tagged post {p.get('id')}")
|
result.append("errors", f"error downloading tagged post {p.get('id')}")
|
||||||
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
logger.error(f"error downloading tagged post, skipping {p.get('id')}: {e} {traceback.format_exc()}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
tagged_count += 1
|
tagged_count += 1
|
||||||
if tagged_count >= max_to_download:
|
if tagged_count >= max_to_download:
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import re
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import instaloader
|
import instaloader
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
@@ -29,8 +29,9 @@ class InstagramExtractor(Extractor):
|
|||||||
# TODO: links to stories
|
# TODO: links to stories
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
|
logger.warning(
|
||||||
logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
|
"Instagram Extractor is not actively maintained, and may not work as expected.\nPlease consider using the Instagram Tbot Extractor or Instagram API Extractor instead."
|
||||||
|
)
|
||||||
|
|
||||||
self.insta = instaloader.Instaloader(
|
self.insta = instaloader.Instaloader(
|
||||||
download_geotags=True,
|
download_geotags=True,
|
||||||
@@ -43,12 +44,11 @@ class InstagramExtractor(Extractor):
|
|||||||
self.insta.load_session_from_file(self.username, self.session_file)
|
self.insta.load_session_from_file(self.username, self.session_file)
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
logger.debug("Session file failed", exc_info=True)
|
logger.info("no valid session file found - Attempting login with use and password.")
|
||||||
logger.info("No valid session file found - Attempting login with use and password.")
|
|
||||||
self.insta.login(self.username, self.password)
|
self.insta.login(self.username, self.password)
|
||||||
self.insta.save_session_to_file(self.session_file)
|
self.insta.save_session_to_file(self.session_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
logger.error(f"failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
@@ -72,14 +72,14 @@ class InstagramExtractor(Extractor):
|
|||||||
result = self.download_profile(url, profile_matches[0])
|
result = self.download_profile(url, profile_matches[0])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
f"failed to download with instagram extractor due to: {e}, make sure your account credentials are valid."
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def download_post(self, url: str, post_id: str) -> Metadata:
|
def download_post(self, url: str, post_id: str) -> Metadata:
|
||||||
logger.debug(f"Instagram {post_id=} detected in {url=}")
|
logger.debug(f"Instagram {post_id=} detected")
|
||||||
|
|
||||||
post = instaloader.Post.from_shortcode(self.insta.context, post_id)
|
post = instaloader.Post.from_shortcode(self.insta.context, post_id)
|
||||||
if self.insta.download_post(post, target=post.owner_username):
|
if self.insta.download_post(post, target=post.owner_username):
|
||||||
@@ -87,7 +87,7 @@ class InstagramExtractor(Extractor):
|
|||||||
|
|
||||||
def download_profile(self, url: str, username: str) -> Metadata:
|
def download_profile(self, url: str, username: str) -> Metadata:
|
||||||
# gets posts, posts where username is tagged, igtv postss, stories, and highlights
|
# gets posts, posts where username is tagged, igtv postss, stories, and highlights
|
||||||
logger.debug(f"Instagram {username=} detected in {url=}")
|
logger.debug(f"Instagram {username=} detected")
|
||||||
|
|
||||||
profile = instaloader.Profile.from_username(self.insta.context, username)
|
profile = instaloader.Profile.from_username(self.insta.context, username)
|
||||||
try:
|
try:
|
||||||
@@ -95,27 +95,27 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"profile_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download post: {post.shortcode}: {e}")
|
logger.error(f"failed to download post: {post.shortcode}: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed profile.get_posts: {e}")
|
logger.error(f"failed profile.get_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_tagged_posts():
|
for post in profile.get_tagged_posts():
|
||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"tagged_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download tagged post: {post.shortcode}: {e}")
|
logger.error(f"failed to download tagged post: {post.shortcode}: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed profile.get_tagged_posts: {e}")
|
logger.error(f"failed profile.get_tagged_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for post in profile.get_igtv_posts():
|
for post in profile.get_igtv_posts():
|
||||||
try:
|
try:
|
||||||
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
self.insta.download_post(post, target=f"igtv_post_{post.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download igtv post: {post.shortcode}: {e}")
|
logger.error(f"failed to download igtv post: {post.shortcode}: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed profile.get_igtv_posts: {e}")
|
logger.error(f"failed profile.get_igtv_posts: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for story in self.insta.get_stories([profile.userid]):
|
for story in self.insta.get_stories([profile.userid]):
|
||||||
@@ -123,9 +123,9 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download story item: {item}: {e}")
|
logger.error(f"failed to download story item: {item}: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed get_stories: {e}")
|
logger.error(f"failed get_stories: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for highlight in self.insta.get_highlights(profile.userid):
|
for highlight in self.insta.get_highlights(profile.userid):
|
||||||
@@ -133,9 +133,9 @@ class InstagramExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download highlight item: {item}: {e}")
|
logger.error(f"failed to download highlight item: {item}: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed get_highlights: {e}")
|
logger.error(f"failed get_highlights: {e}")
|
||||||
|
|
||||||
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
return self.process_downloads(url, f"@{username}", profile._asdict(), None)
|
||||||
|
|
||||||
@@ -158,4 +158,4 @@ class InstagramExtractor(Extractor):
|
|||||||
|
|
||||||
return result.success("instagram")
|
return result.success("instagram")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Could not fetch instagram post {url} due to: {e}")
|
logger.error(f"could not fetch instagram post due to: {e}")
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import shutil
|
|||||||
import time
|
import time
|
||||||
from sqlite3 import OperationalError
|
from sqlite3 import OperationalError
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from telethon.sync import TelegramClient
|
from telethon.sync import TelegramClient
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import shutil
|
import shutil
|
||||||
from typing import IO
|
from typing import IO
|
||||||
import os
|
import os
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Media
|
from auto_archiver.core import Media
|
||||||
from auto_archiver.core import Storage
|
from auto_archiver.core import Storage
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import traceback
|
import traceback
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import opentimestamps
|
import opentimestamps
|
||||||
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
|
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
|
||||||
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ import traceback
|
|||||||
import pdqhash
|
import pdqhash
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from typing import IO
|
|||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
import os
|
import os
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Media
|
from auto_archiver.core import Media
|
||||||
from auto_archiver.core import Storage
|
from auto_archiver.core import Storage
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import ssl
|
|||||||
import os
|
import os
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import requests
|
|||||||
import re
|
import re
|
||||||
import html
|
import html
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from telethon.errors.rpcerrorlist import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
|
|||||||
|
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
import os
|
import os
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Media, Metadata
|
from auto_archiver.core import Media, Metadata
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import hashlib
|
|||||||
|
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
||||||
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import re
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from pytwitter import Api
|
from pytwitter import Api
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from warcio.archiveiterator import ArchiveIterator
|
from warcio.archiveiterator import ArchiveIterator
|
||||||
|
|
||||||
from auto_archiver.core import Media, Metadata
|
from auto_archiver.core import Media, Metadata
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import time
|
import time
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import traceback
|
import traceback
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from auto_archiver.core import Enricher
|
from auto_archiver.core import Enricher
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
|||||||
37
src/auto_archiver/utils/custom_logger.py
Normal file
37
src/auto_archiver/utils/custom_logger.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
from loguru import logger
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def extract_log_data(record):
    """Build a compact, JSON-serializable summary of a log record.

    Args:
        record: Mapping describing one log call. It must provide "level"
            (with a ``name`` attribute), "time" (with ``isoformat``),
            "file" (with a ``name`` attribute), "function", "line" and
            "message". "extra" and "exception" are optional.

    Returns:
        A dict whose keys are inserted in this order: "level", "time"
        (ISO 8601, second precision), "loc" ("file:function:line"), every
        truthy extra among "trace", "url", "worksheet" and "row",
        "message", and finally "exception" (a string) when one is attached.
    """
    subset = {
        "level": record["level"].name,
        "time": record["time"].isoformat(timespec="seconds"),
        "loc": f"{record['file'].name}:{record['function']}:{record['line']}",
    }

    extra = record.get("extra", {})
    for extra_key in ("trace", "url", "worksheet", "row"):
        if extra_val := extra.get(extra_key):
            subset[extra_key] = extra_val

    subset["message"] = record["message"]

    if exception := record.get("exception"):
        # The raw exception object (class, instance, traceback) cannot be
        # passed to json.dumps, so store a plain-text description instead.
        subset["exception"] = _format_exception(exception)
    return subset


def _format_exception(exception):
    """Return a one-line, JSON-safe description of a record's exception."""
    if isinstance(exception, str):
        return exception
    exc_value = getattr(exception, "value", None)
    exc_type = getattr(exception, "type", None)
    if exc_type is None and exc_value is not None:
        exc_type = type(exc_value)
    if exc_type is None:
        # Unknown shape: fall back to whatever text the object offers.
        return str(exception)
    return f"{getattr(exc_type, '__name__', exc_type)}: {exc_value}"
|
||||||
|
|
||||||
|
|
||||||
|
def serialize_no_message(record):
    """Return the record summary as a JSON string, without the "message" key.

    The summary comes from ``extract_log_data``; non-ASCII characters are
    written as-is rather than escaped.
    """
    fields = {
        key: value
        for key, value in extract_log_data(record).items()
        if key != "message"
    }
    return json.dumps(fields, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def serialize(record):
    """Return the full record summary from ``extract_log_data`` as a JSON string.

    Non-ASCII characters are written as-is rather than escaped.
    """
    summary = extract_log_data(record)
    return json.dumps(summary, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def patching(record):
    """Attach two JSON renderings of the record to ``record["extra"]``.

    "serialized" holds the full summary and "serialize_no_message" holds the
    same summary without the message text.
    """
    extra = record["extra"]
    extra["serialized"] = serialize(record)
    extra["serialize_no_message"] = serialize_no_message(record)
|
||||||
|
|
||||||
|
|
||||||
|
# Rebind the module-level `logger` to the object returned by `logger.patch`,
# passing `patching` as the patcher; code importing `logger` from this module
# gets this instance rather than the one imported from loguru.
logger = logger.patch(patching)
|
||||||
@@ -7,7 +7,7 @@ from datetime import datetime, timezone
|
|||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
def mkdir_if_not_exists(folder):
|
def mkdir_if_not_exists(folder):
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory
|
|||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
import pytest
|
import pytest
|
||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from auto_archiver.core.module import ModuleFactory
|
from auto_archiver.core.module import ModuleFactory
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
class ExampleExtractor(Extractor):
|
class ExampleExtractor(Extractor):
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
||||||
|
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
|
|
||||||
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ def orchestration_file(orchestration_file_path):
|
|||||||
def autoarchiver(tmp_path, monkeypatch, request):
|
def autoarchiver(tmp_path, monkeypatch, request):
|
||||||
def _autoarchiver(args=[]):
|
def _autoarchiver(args=[]):
|
||||||
def cleanup():
|
def cleanup():
|
||||||
from loguru import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
if not logger._core.handlers.get(0):
|
if not logger._core.handlers.get(0):
|
||||||
logger._core.handlers_count = 0
|
logger._core.handlers_count = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user