class CSVFeeder(Feeder):
    """Feed URLs read from one or more CSV files.

    Iterating the feeder walks every path in ``self.files`` and yields one
    ``Metadata`` object per row whose URL column holds a valid URL.
    """

    # Which column holds the URLs: an integer index, a header name (str), or
    # None. Falsy values (None, 0, "") all select the first column.
    column = None

    @staticmethod
    def _cell(row, index):
        """Return ``row[index]``, or ``None`` when the row has no such cell."""
        try:
            return row[index]
        except IndexError:
            return None

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` for every valid URL in the configured files.

        For each file the first row is inspected:

        * if ``column`` is a string, the first row is taken as the header and
          the column is located by name; when the name is missing an error is
          logged and iteration stops (later files are not read);
        * otherwise, if the first row's URL cell is not a valid URL, the row
          is treated as a header and skipped;
        * otherwise the first row is data and the file is rewound.

        Empty files are skipped with a warning, blank lines are ignored, and
        rows whose URL cell is missing or invalid are skipped with a warning.
        """
        for file in self.files:
            # newline="" is what the csv module asks for, so that newlines
            # embedded in quoted fields are parsed correctly.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)

                # A bare next(reader) would raise StopIteration on an empty
                # file, which turns into RuntimeError inside a generator.
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"CSV file {file} is empty, skipping")
                    continue

                url_column = self.column or 0
                if isinstance(url_column, str):
                    # Column given by name: resolve it against the header row.
                    try:
                        url_column = first_row.index(url_column)
                    except ValueError:
                        logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
                        return
                else:
                    first_value = self._cell(first_row, url_column)
                    if first_value is None or not url_or_none(first_value):
                        # it's a header row, but we've been given a column number already
                        logger.debug(f"Skipping header row: {first_row}")
                    else:
                        # first row isn't a header row, rewind the file
                        f.seek(0)

                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to report
                        continue
                    url = self._cell(row, url_column)
                    if url is None or not url_or_none(url):
                        logger.warning(f"Not a valid URL in row: {row}, skipping")
                        continue
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
"""Tests for ``CSVFeeder``: header detection and URL column selection."""
import pytest

# URLs contained in both CSV data files used by these tests.
EXPECTED_URLS = ["https://example.com/1/", "https://example.com/2/"]


def _collect_urls(feeder):
    """Iterate *feeder* and return the URL of every item it yields."""
    return [item.get_url() for item in feeder]


@pytest.fixture
def headerless_csv_file():
    """Path to the CSV data file that has no header row."""
    return "tests/data/csv_no_headers.csv"


@pytest.fixture
def header_csv_file():
    """Path to the CSV data file whose first row is a header."""
    return "tests/data/csv_with_headers.csv"


@pytest.fixture
def header_csv_file_non_default_column():
    """Path to a CSV data file; not used by any test in this module."""
    return "tests/data/csv_with_headers_non_default_column.csv"


# NOTE(review): ``setup_module`` is a fixture defined outside this file;
# presumably it builds a configured module instance - confirm in conftest.
def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
    """A file without a header row yields every row's URL."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [headerless_csv_file]}
    feeder = setup_module(CSVFeeder, config)

    assert _collect_urls(feeder) == EXPECTED_URLS


def test_csv_feeder_with_headers(header_csv_file, setup_module):
    """The header row is skipped and only the data rows are yielded."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file]}
    feeder = setup_module(CSVFeeder, config)

    assert _collect_urls(feeder) == EXPECTED_URLS


def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
    """Pointing at a non-URL column yields nothing and warns once per row."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file], "column": 1}
    with caplog.at_level("WARNING"):
        feeder = setup_module(CSVFeeder, config)
        urls = list(feeder)

    assert urls == []
    assert "Not a valid URL in row" in caplog.text
    # one warning for each of the two data rows
    assert len(caplog.records) == 2


def test_csv_feeder_column_by_name(header_csv_file, setup_module):
    """The URL column can be selected by its header name."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file], "column": "webpages"}
    feeder = setup_module(CSVFeeder, config)

    assert _collect_urls(feeder) == EXPECTED_URLS