mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
url auth wall detect
This commit is contained in:
@@ -3,7 +3,7 @@ import time, uuid, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..utils import Webdriver, UrlUtil
|
||||
from ..core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
@@ -19,6 +19,10 @@ class ScreenshotEnricher(Enricher):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||
try:
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil
|
||||
from ..core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"calling wayback for {url=}")
|
||||
|
||||
if to_enrich.get("wayback"):
|
||||
|
||||
@@ -3,6 +3,7 @@ from dataclasses import dataclass
|
||||
import mimetypes, uuid, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
from loguru import logger
|
||||
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media
|
||||
@@ -26,12 +27,17 @@ class HtmlFormatter(Formatter):
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
|
||||
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
}
|
||||
|
||||
def format(self, item: Metadata) -> Media:
|
||||
url = item.get_url()
|
||||
if item.is_empty():
|
||||
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
|
||||
return
|
||||
|
||||
content = self.template.render(
|
||||
url=item.get_url(),
|
||||
url=url,
|
||||
title=item.get_title(),
|
||||
media=item.media,
|
||||
metadata=item.get_clean_metadata(),
|
||||
|
||||
19
src/auto_archiver/utils/url.py
Normal file
19
src/auto_archiver/utils/url.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import re
|
||||
|
||||
class UrlUtil:
    """Helpers for inspecting URLs before they are archived or enriched."""

    # Private Telegram channel posts: t.me/c/<channel>/<message id>.
    # Accepts both http and https so a plain-http link is not missed.
    telegram_private = re.compile(r"https?://t\.me(/c)/(.+)/(\d+)")
    # Any Instagram URL, with or without the "www." prefix.
    # NOTE: the attribute name keeps its original spelling ("istagram") so
    # existing references to UrlUtil.is_istagram keep working.
    is_istagram = re.compile(r"https?://(www\.)?instagram\.com")

    @staticmethod
    def clean(url):
        """Return ``url`` unchanged (placeholder for future normalisation)."""
        return url

    @staticmethod
    def is_auth_wall(url):
        """
        Check whether ``url`` is behind an authentication wall, meaning steps
        like wayback, wacz, ... may not work.

        Returns True when the URL matches one of the known auth-walled
        patterns (private Telegram channel posts, Instagram), False otherwise.
        Empty, None or non-string input is treated as "not behind an auth
        wall" instead of raising.
        """
        if not url or not isinstance(url, str):
            return False

        auth_wall_patterns = (UrlUtil.telegram_private, UrlUtil.is_istagram)
        # Pattern.match anchors at the start of the string, so only URLs that
        # begin with one of the known hosts are flagged.
        return any(pattern.match(url) for pattern in auth_wall_patterns)
|
||||
|
||||
Reference in New Issue
Block a user