Unit tests for csv feeder + fix some bugs

This commit is contained in:
Patrick Robertson
2025-02-04 13:37:17 +01:00
parent b301f60ea3
commit 78e6418249
4 changed files with 82 additions and 4 deletions

View File

@@ -7,16 +7,32 @@ from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per valid URL found in CSV files.

    Iterates over the paths in ``self.files`` and reads the URL from the
    column selected by ``self.column``.
    """

    # Which column holds the URLs: a header name (str), a zero-based index
    # (int), or None to fall back to the first column.
    column = None

    @staticmethod
    def _cell(row, index):
        """Return ``row[index]``, or None when the row is too short."""
        try:
            return row[index]
        except IndexError:
            return None

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` with its URL set for every valid URL row.

        For each file the first row is inspected:

        * if ``column`` is a string, the first row is treated as the header
          and the column index is looked up by name; if the name is missing
          an error is logged and iteration stops;
        * otherwise, if the selected cell of the first row is not a URL, the
          row is treated as a header and skipped;
        * otherwise the first row is data, so the file is rewound.

        Rows whose selected cell is missing or not a valid URL are logged
        and skipped. Empty files are logged and skipped.
        """
        for file in self.files:
            # newline="" is what the csv module expects so that it can handle
            # newlines embedded in quoted fields itself.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)

                # A bare next() would raise StopIteration on an empty file,
                # which inside a generator surfaces as RuntimeError (PEP 479).
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"CSV file {file} is empty, skipping")
                    continue

                url_column = self.column or 0
                if isinstance(url_column, str):
                    try:
                        url_column = first_row.index(url_column)
                    except ValueError:
                        logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
                        return
                else:
                    first_cell = self._cell(first_row, url_column)
                    if first_cell is None or not url_or_none(first_cell):
                        # it's a header row, but we've been given a column number already
                        logger.debug(f"Skipping header row: {first_row}")
                    else:
                        # first row isn't a header row, rewind the file
                        f.seek(0)

                for row in reader:
                    url = self._cell(row, url_column)
                    # Blank or short rows have no cell at url_column; treat
                    # them like any other row without a valid URL.
                    if url is None or not url_or_none(url):
                        logger.warning(f"Not a valid URL in row: {row}, skipping")
                        continue
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)