Further cleanup

* Removes (partly) the ArchivingOrchestrator
* Removes the cli_feeder module, and makes it the 'default', allowing you to pass URLs directly on the command line, without having to use the cumbersome --cli_feeder.urls. Just do auto-archiver https://my.url.com
* More unit tests
* Improved error handling
This commit is contained in:
Patrick Robertson
2025-01-30 16:43:09 +01:00
parent 953011f368
commit d6b4b7a932
27 changed files with 417 additions and 191 deletions

View File

@@ -12,7 +12,6 @@ from dataclasses import dataclass
import mimetypes
import os
import mimetypes
import requests
from loguru import logger
from retrying import retry
@@ -71,7 +70,7 @@ class Extractor(BaseModule):
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'