mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
44 lines
1.4 KiB
Python
44 lines
1.4 KiB
Python
from loguru import logger
|
|
import csv
|
|
|
|
from auto_archiver.base_processors import Feeder
|
|
from auto_archiver.core import Metadata, ArchivingContext
|
|
from auto_archiver.utils import url_or_none
|
|
|
|
class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per URL read from CSV files.

    Two options drive it (see :meth:`configs`): ``files``, the CSV paths to
    read, and ``column``, the column holding the URLs (the first column when
    unset). ``column`` may be a 0-based position or a header name.
    """

    name = "csv_feeder"

    @staticmethod
    def configs() -> dict:
        """Return the configuration options understood by this feeder."""
        return {
            "files": {
                "default": None,
                # Adjacent literals instead of a backslash continuation, so the
                # source indentation does not leak into the help text.
                "help": (
                    "Path to the input file(s) to read the URLs from, comma separated. "
                    "Input files should be formatted with one URL per line"
                ),
                "type": "auto_archiver.utils.parse_csv_to_set",
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            },
        }

    @staticmethod
    def _resolve_column(column, first_row):
        """Translate the ``column`` setting into ``(index, matched_by_name)``.

        Args:
            column: the configured value; ``None``/empty means the first
                column, an int (or numeric string) is a position, any other
                string is looked up as a header name in ``first_row``.
            first_row: the first row of the CSV file, as a list of cells.

        Returns:
            A tuple of the positional index and a flag telling whether the
            index was found by header name (in which case ``first_row`` is
            necessarily a header).

        Raises:
            ValueError: if ``column`` is a name that is absent from
                ``first_row``.
        """
        if column is None or column == "":
            return 0, False
        if isinstance(column, int):
            return column, False

        wanted = str(column).strip()
        try:
            # Config values may arrive as strings, e.g. "1" from a CLI flag.
            return int(wanted), False
        except ValueError:
            pass
        # list.index raises ValueError when the name is not in the header.
        return [cell.strip() for cell in first_row].index(wanted), True

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` object for every URL found in ``self.files``.

        For each file the first row is inspected: it is treated as a header
        (and skipped) when the column was selected by name or when its URL
        cell is not accepted by ``url_or_none``; otherwise it is processed as
        data like any other row. Empty files, files missing the requested
        column name, and rows too short to contain the URL column are logged
        and skipped instead of aborting the whole run.
        """
        # Function-scope import keeps this block self-contained.
        import itertools

        processed = 0
        for file in self.files or ():
            # newline="" is what the csv module expects so that newlines
            # embedded in quoted fields are parsed correctly.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)

                # A default avoids StopIteration escaping from this generator
                # (which Python turns into RuntimeError) on an empty file.
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"Skipping empty file: {file}")
                    continue

                try:
                    url_column, by_name = self._resolve_column(self.column, first_row)
                except ValueError:
                    logger.error(
                        f"Column {self.column!r} not found in header row {first_row} of {file}, skipping file"
                    )
                    continue

                try:
                    first_cell = first_row[url_column]
                except IndexError:
                    first_cell = None

                leading_rows = []
                if by_name or first_cell is None or not url_or_none(first_cell):
                    # it's a header row, skip it
                    logger.debug(f"Skipping header row: {first_row}")
                else:
                    # The first row already holds a URL: keep it as data.
                    leading_rows.append(first_row)

                for row in itertools.chain(leading_rows, reader):
                    try:
                        url = row[url_column]
                    except IndexError:
                        # csv.reader returns [] for blank lines; short rows
                        # simply have no cell in the requested column.
                        logger.warning(f"Row has no column {url_column}, skipping: {row}")
                        continue

                    logger.debug(f"Processing {url}")
                    processed += 1
                    yield Metadata().set_url(url)
                    # Runs when the consumer asks for the next item, i.e.
                    # after the yielded entry has been handed over.
                    ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {processed} URL(s)")