diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py
index a953d16..3c768f6 100644
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -3,7 +3,7 @@ import time, uuid, os
 from selenium.common.exceptions import TimeoutException
 
 from . import Enricher
-from ..utils import Webdriver
+from ..utils import Webdriver, UrlUtil
 from ..core import Media, Metadata
 
 class ScreenshotEnricher(Enricher):
@@ -19,6 +19,10 @@ class ScreenshotEnricher(Enricher):
 
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
+        if UrlUtil.is_auth_wall(url):
+            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
+            return
+
         logger.debug(f"Enriching screenshot for {url=}")
         with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
             try:
diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py
index 187c3fe..45ecb7d 100644
--- a/src/auto_archiver/enrichers/wayback_enricher.py
+++ b/src/auto_archiver/enrichers/wayback_enricher.py
@@ -1,8 +1,10 @@
 from loguru import logger
 import time, requests
 
+
 from . import Enricher
 from ..archivers import Archiver
+from ..utils import UrlUtil
 from ..core import Metadata
 
 class WaybackArchiverEnricher(Enricher, Archiver):
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
 
     def enrich(self, to_enrich: Metadata) -> bool:
         url = to_enrich.get_url()
+        if UrlUtil.is_auth_wall(url):
+            logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
+            return
+
         logger.debug(f"calling wayback for {url=}")
 
         if to_enrich.get("wayback"):
diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py
index cb00111..abd8cf4 100644
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -3,6 +3,7 @@ from dataclasses import dataclass
 import mimetypes, uuid, os, pathlib
 from jinja2 import Environment, FileSystemLoader
 from urllib.parse import quote
+from loguru import logger
 
 from ..version import __version__
 from ..core import Metadata, Media
@@ -26,12 +27,17 @@ class HtmlFormatter(Formatter):
     @staticmethod
     def configs() -> dict:
         return {
-            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
-
+            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
         }
+
     def format(self, item: Metadata) -> Media:
+        url = item.get_url()
+        if item.is_empty():
+            logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
+            return
+
         content = self.template.render(
-            url=item.get_url(),
+            url=url,
             title=item.get_title(),
             media=item.media,
             metadata=item.get_clean_metadata(),
diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py
new file mode 100644
index 0000000..c854405
--- /dev/null
+++ b/src/auto_archiver/utils/url.py
@@ -0,0 +1,27 @@
+import re
+
+class UrlUtil:
+    """URL helpers: `clean` currently returns the url unchanged, `is_auth_wall` flags urls matching the patterns below."""
+
+    # https://t.me/c/<chat>/<digits> links
+    telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
+    # https urls on the www.instagram.com host
+    is_instagram = re.compile(r"https:\/\/www\.instagram\.com")
+    # the attribute was originally misspelled; alias kept so references to the old name still resolve
+    is_istagram = is_instagram
+
+    @staticmethod
+    def clean(url): return url
+
+    @staticmethod
+    def is_auth_wall(url):
+        """
+        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
+
+        returns True when url starts with one of the patterns above (re.match anchors at the start of the string), False otherwise
+        """
+        if UrlUtil.telegram_private.match(url): return True
+        if UrlUtil.is_instagram.match(url): return True
+
+        return False
+