# auto-archiver/src/auto_archiver/modules/csv_feeder/csv_feeder.py

from loguru import logger
import csv

from auto_archiver.base_processors import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import url_or_none

class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per URL read from CSV files.

    Reads ``self.files`` (the input paths) and ``self.column`` (which column
    holds the URL). Both are expected to be populated from the options
    declared in :meth:`configs` before iteration starts.
    """

    name = "csv_feeder"

    @staticmethod
    def configs() -> dict:
        """Return the configuration options understood by this feeder."""
        return {
            "files": {
                "default": None,
                "help": "Path to the input file(s) to read the URLs from, comma separated. \
                        Input files should be formatted with one URL per line",
                "type": "auto_archiver.utils.parse_csv_to_set",
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            }
        }

    @staticmethod
    def _resolve_column(column, first_row: list) -> int:
        """Translate the configured column into a 0-based index.

        Args:
            column: ``None``/empty (meaning the first column), an ``int``, a
                numeric string, or a column name to look up in ``first_row``.
            first_row: First row of the file, used as the header when
                ``column`` is a name.

        Returns:
            The index of the URL column.

        Raises:
            ValueError: If ``column`` is a name that ``first_row`` lacks.
        """
        # Mirrors the historical `self.column or 0` default.
        if not column:
            return 0
        if isinstance(column, int):
            return column
        wanted = str(column).strip()
        try:
            # Config values may arrive as text, e.g. "2".
            return int(wanted)
        except ValueError:
            pass
        header = [cell.strip() for cell in first_row]
        try:
            return header.index(wanted)
        except ValueError:
            raise ValueError(
                f"Column {wanted!r} not found in header row: {first_row}"
            ) from None

    @staticmethod
    def _cell(row: list, index: int):
        """Return ``row[index]``, or ``None`` when the row is too short."""
        try:
            return row[index]
        except IndexError:
            return None

    def _urls_in_file(self, file):
        """Yield the raw URL cell of every usable row in ``file``.

        The first row is treated as a header (and skipped) only when its URL
        cell is rejected by ``url_or_none``; otherwise it is yielded like any
        other row. Rows with no value in the URL column are skipped.
        """
        # newline="" is what the csv module asks for so that newlines embedded
        # in quoted fields are handled by the reader itself.
        with open(file, "r", newline="") as f:
            reader = csv.reader(f)
            # A default avoids StopIteration escaping the generator, which
            # Python would turn into a RuntimeError for empty files.
            first_row = next(reader, None)
            if first_row is None:
                logger.warning(f"Skipping empty file: {file}")
                return

            url_column = self._resolve_column(self.column, first_row)

            first_cell = self._cell(first_row, url_column)
            if first_cell and url_or_none(first_cell):
                # No header present: the first row already carries a URL.
                yield first_cell
            else:
                # it's a header row, skip it
                logger.debug(f"Skipping header row: {first_row}")

            for row in reader:
                url = self._cell(row, url_column)
                if not url:
                    # Blank lines parse as [] and short rows lack the column.
                    logger.debug(f"Skipping row without a URL: {row}")
                    continue
                yield url

    def __iter__(self) -> Metadata:
        """Iterate over every configured file, yielding a ``Metadata`` per URL.

        After each file is exhausted the archiving context's ``folder`` is set
        to ``"cli"``. A summary with the number of yielded URLs is logged once
        all files are done.
        """
        processed = 0
        for file in self.files:
            for url in self._urls_in_file(file):
                logger.debug(f"Processing {url}")
                yield Metadata().set_url(url)
                processed += 1
            ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {processed} URL(s)")