mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
wayback enricher ready
This commit is contained in:
@@ -1,2 +1,3 @@
|
||||
from .enricher import Enricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackEnricher
|
||||
@@ -18,4 +18,4 @@ class Enricher(Step, ABC):
|
||||
return Step.init(name, config, Enricher)
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, item: Metadata) -> Metadata: pass
|
||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
from media import Media
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import time, uuid, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time
|
||||
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot"
|
||||
name = "screenshot_enricher"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
@@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher):
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
||||
}
|
||||
|
||||
def enrich(self, item: Metadata) -> Metadata:
|
||||
url = self.get_url(item)
|
||||
print(f"enriching {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(2)
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
|
||||
#TODO: return saved object
|
||||
driver.save_screenshot("TODO-HASH_OR_UUID.png")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
# return None
|
||||
|
||||
68
src/enrichers/wayback_enricher.py
Normal file
68
src/enrichers/wayback_enricher.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
|
||||
class WaybackEnricher(Enricher):
    """
    Submits the current URL to the Internet Archive's Wayback Machine and
    records the outcome on the item under the "wayback" key.

    The stored value is one of:
      - the final archive URL (str) when the capture finishes within `timeout`;
      - a dict with the `job_id` and a `check_status` URL when it does not;
      - an error message (str) when the submission itself is rejected.
    """
    name = "wayback_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        # NOTE(review): self.key / self.secret / self.timeout are presumably
        # populated from configs() by Step.__init__ -- confirm against Step.
        super().__init__(config)
        # Validate BOTH credentials; the key used to go unchecked because the
        # secret was tested twice.
        assert isinstance(self.key, str) and len(self.key) > 0, "please provide a value for the wayback_enricher API key"
        assert isinstance(self.secret, str) and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help text) for this step."""
        return {
            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
            "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Request a Wayback Machine capture of the item's URL and store the result.

        Polls the job status roughly once per second for up to `self.timeout`
        seconds. Nothing is returned; the outcome is written to `to_enrich`
        via `to_enrich.set("wayback", ...)`.
        """
        url = to_enrich.get_url()
        logger.debug(f"Enriching wayback for {url=}")

        ia_headers = {
            "Accept": "application/json",
            "Authorization": f"LOW {self.key}:{self.secret}"
        }
        # NOTE(review): no request timeout is passed here or in the status
        # poll below, so a stalled connection can block longer than the
        # configured `timeout` -- consider adding one.
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            # Error responses are not guaranteed to be JSON (e.g. an HTML
            # gateway error page); fall back to the raw body instead of
            # raising while trying to report the failure.
            try:
                details = r.json()
            except ValueError:
                details = r.text
            em = f"Internet archive failed with status of {r.status_code}: {details}"
            logger.error(em)
            to_enrich.set("wayback", em)
            return

        # A 200 response without a usable job_id cannot be polled; record the
        # problem instead of crashing the whole enrichment step.
        try:
            job_id = r.json()['job_id']
        except (ValueError, KeyError, TypeError):
            em = f"Internet archive response did not include a job_id: {r.text}"
            logger.error(em)
            to_enrich.set("wayback", em)
            return

        status_url = f'https://web.archive.org/save/status/{job_id}'

        # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
        start_time = time.time()
        wayback_url = None
        attempt = 1
        while not wayback_url and time.time() - start_time <= self.timeout:
            try:
                logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
                r_status = requests.get(status_url, headers=ia_headers)
                r_json = r_status.json()
                if r_status.status_code == 200 and r_json['status'] == 'success':
                    wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
            except Exception as e:
                # Best-effort polling: any failure just triggers another attempt.
                logger.warning(f"error fetching status for {url=} due to: {e}")
            if not wayback_url:
                attempt += 1
                time.sleep(1)  # TODO: can be improved with exponential backoff

        if wayback_url:
            to_enrich.set("wayback", wayback_url)
        else:
            # Capture still pending: keep enough information to check on it later.
            to_enrich.set("wayback", {"job_id": job_id, "check_status": status_url})
|
||||
Reference in New Issue
Block a user