import csv
from itertools import chain
from typing import Iterator

from loguru import logger

from auto_archiver.base_processors import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import url_or_none


class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per URL found in CSV files.

    Reads ``self.files`` (an iterable of file paths) and ``self.column``
    (a column index, a column name, or ``None`` for the first column).
    NOTE(review): both attributes are presumably populated from ``configs()``
    by the surrounding framework -- confirm against the ``Feeder`` base class.
    """

    name = "csv_feeder"

    @staticmethod
    def configs() -> dict:
        """Return the configuration options understood by this feeder."""
        return {
            "files": {
                "default": None,
                # Implicit concatenation keeps source indentation out of the
                # user-facing help text.
                "help": (
                    "Path to the input file(s) to read the URLs from, comma separated. "
                    "Input files should be formatted with one URL per line"
                ),
                "type": "auto_archiver.utils.parse_csv_to_set",
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            },
        }

    @staticmethod
    def _resolve_column(column, first_row):
        """Translate the configured column into a 0-based index.

        Args:
            column: ``None``/empty (first column), an ``int`` index, a numeric
                string such as ``"2"``, or a header name.
            first_row: The first row of the file, used to look up header names.

        Returns:
            The column index, or ``None`` when ``column`` is a name that does
            not appear in ``first_row``.
        """
        if not column:
            return 0
        if isinstance(column, str):
            wanted = column.strip()
            if wanted.isdigit():
                return int(wanted)
            try:
                return [cell.strip() for cell in first_row].index(wanted)
            except ValueError:
                return None
        return column

    @staticmethod
    def _cell(row, index):
        """Return ``row[index]``, or ``None`` when the row has no such column."""
        try:
            return row[index]
        except IndexError:
            return None

    def __iter__(self) -> Iterator[Metadata]:
        """Yield a ``Metadata`` object for every URL in the configured files.

        The first row of each file is treated as a header (and skipped) only
        when its value in the URL column is not a URL; otherwise it is
        processed like any other row. Rows that are blank or too short to
        contain the URL column are skipped with a warning.
        """
        processed = 0
        for file in self.files or ():
            # newline="" is what the csv module requires so that newlines
            # embedded in quoted fields are parsed correctly.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)
                # A default avoids StopIteration escaping the generator
                # (which would surface as RuntimeError) on an empty file.
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"Skipping empty file: {file}")
                    continue

                url_column = self._resolve_column(self.column, first_row)
                if url_column is None:
                    logger.error(
                        f"Column {self.column!r} not found in header row of {file}: {first_row}"
                    )
                    continue

                first_value = self._cell(first_row, url_column)
                if first_value is not None and url_or_none(first_value):
                    # Not a header: the first row holds a URL, keep it.
                    rows = chain([first_row], reader)
                else:
                    # it's a header row, skip it
                    logger.debug(f"Skipping header row: {first_row}")
                    rows = reader

                for row in rows:
                    url = self._cell(row, url_column)
                    if url is None or not url.strip():
                        logger.warning(f"Skipping row without a value in column {url_column}: {row}")
                        continue
                    logger.debug(f"Processing {url}")
                    processed += 1
                    yield Metadata().set_url(url)
                    # NOTE(review): runs only when the consumer asks for the
                    # next item, i.e. after the yielded URL was handed out.
                    ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {processed} URL(s)")