Further cleanup

* Removes (partly) the ArchivingOrchestrator
* Removes the cli_feeder module, and makes it the 'default', allowing you to pass URLs directly on the command line, without having to use the cumbersome --cli_feeder.urls. Just do auto-archiver https://my.url.com
* More unit tests
* Improved error handling
This commit is contained in:
Patrick Robertson
2025-01-30 16:43:09 +01:00
parent 953011f368
commit d6b4b7a932
27 changed files with 417 additions and 191 deletions

View File

@@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):
if len(data["results"]) == 0 or cursor is None:
break
logger.success(f"Processed {count} URL(s)")

View File

@@ -1 +0,0 @@
from .cli_feeder import CLIFeeder

View File

@@ -1,27 +0,0 @@
{
"name": "CLI Feeder",
"type": ["feeder"],
"requires_setup": False,
"dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
"required": True,
"do_not_store": True,
"metavar": "INPUT URLS",
},
},
"description": """
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
### Features
- Takes a single URL or a list of URLs provided via the command line.
- Converts each URL into a `Metadata` object and yields it for processing.
- Ensures URLs are processed only if they are explicitly provided.
"""
}

View File

@@ -1,15 +0,0 @@
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")

View File

@@ -26,7 +26,6 @@
- Supports reading URLs from multiple input files, specified as a comma-separated list.
- Allows specifying the column number or name to extract URLs from.
- Skips header rows if the first value is not a valid URL.
- Integrates with the `ArchivingContext` to manage URL feeding.
### Setu N
- Input files should be formatted with one URL per line.

View File

@@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")
ArchivingContext.set("folder", "cli")

View File

@@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
"live_from_start": self.live_from_start, "proxy": self.proxy,
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:

View File

@@ -7,7 +7,7 @@ import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")

View File

@@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
@@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher):
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))

View File

@@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
with open(cert_fn, "w") as f: f.write(cert)
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")

View File

@@ -9,7 +9,7 @@ from tqdm import tqdm
import re, time, json, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@@ -120,7 +120,7 @@ class TelethonArchiver(Extractor):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message

View File

@@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
duration = m.get("duration")

View File

@@ -9,9 +9,7 @@ from asn1crypto import pem
import certifi
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class TimestampingEnricher(Enricher):
"""
@@ -33,7 +31,7 @@ class TimestampingEnricher(Enricher):
logger.warning(f"No hashes found in {url=}")
return
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
data_to_sign = "\n".join(hashes)
@@ -93,7 +91,7 @@ class TimestampingEnricher(Enricher):
cert_chain = []
for cert in path:
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
with open(cert_fn, "wb") as f:
f.write(cert.dump())
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))

View File

@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
@@ -35,7 +35,7 @@ class VkExtractor(Extractor):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))

View File

@@ -5,7 +5,7 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
@@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
@@ -154,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)