Set up feeder manifests (not merged by source yet)

2026-06-13 05:38:29 +03:00 · 2025-01-23 09:16:42 +00:00
parent c517d35bdf
commit 79684f8348
82 changed files with 721 additions and 730 deletions
--- a/src/auto_archiver/modules/csv_feeder/init.py
+++ b/src/auto_archiver/modules/csv_feeder/init.py
--- a/src/auto_archiver/modules/csv_feeder/manifest.py
+++ b/src/auto_archiver/modules/csv_feeder/manifest.py
@@ -0,0 +1,39 @@
+{
+    # Human-readable module name and the module type(s) it provides.
+    "name": "CSV Feeder",
+    "type": ["feeder"],
+    "requires_setup": False,
+    "external_dependencies": {
+        "python": ["loguru"],
+        # NOTE(review): [""] looks like a placeholder for "no binaries" — confirm the
+        # loader does not treat the empty string as a required executable.
+        "bin": [""]
+    },
+    "configs": {
+            "files": {
+                "default": None,
+                # Adjacent literals (not a backslash continuation) keep source indentation
+                # out of the help text.
+                "help": "Path to the input file(s) to read the URLs from, comma separated. "
+                        "Input files should be formatted with one URL per line",
+                # Split the CLI value on commas and drop duplicates, keeping the given order.
+                "cli_set": lambda cli_val, cur_val: list(dict.fromkeys(cli_val.split(",")))
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        },
+    "description": """
+    Reads URLs from CSV files and feeds them into the archiving process.
+
+    ### Features
+    - Supports reading URLs from multiple input files, specified as a comma-separated list.
+    - Allows specifying the column number or name to extract URLs from.
+    - Skips header rows if the first value is not a valid URL.
+    - Integrates with the `ArchivingContext` to manage URL feeding.
+
+    ### Setup
+    - Input files should be formatted with one URL per line.
+    """
+}
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -0,0 +1,100 @@
+from loguru import logger
+import csv
+
+from auto_archiver.feeders import Feeder
+from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.utils import url_or_none
+
+class CSVFeeder(Feeder):
+    """Feeder that reads URLs from one or more CSV files.
+
+    Reads `self.files` and `self.column`; these are presumably populated from
+    the entries declared in `configs()` (that wiring is not visible here).
+    """
+
+    name = "csv_feeder"
+
+    @staticmethod
+    def configs() -> dict:
+        """Return the config schema: `files` (input paths) and `column` (URL column)."""
+        return {
+            "files": {
+                "default": None,
+                # Adjacent literals (not a backslash continuation) keep source
+                # indentation out of the help text.
+                "help": "Path to the input file(s) to read the URLs from, comma separated. "
+                        "Input files should be formatted with one URL per line",
+                # Split on commas and drop duplicates, keeping the given order.
+                "cli_set": lambda cli_val, cur_val: list(dict.fromkeys(cli_val.split(",")))
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        }
+
+    @staticmethod
+    def _column_index(column, header: list) -> int:
+        """Resolve the configured column (index or header name) to a list index.
+
+        Raises:
+            ValueError: if `column` is a name that is not present in `header`.
+        """
+        if column is None or column == "":
+            return 0
+        if isinstance(column, int):
+            return column
+        label = str(column).strip()
+        try:
+            return int(label)
+        except ValueError:
+            # Not numeric, so it has to name one of the header cells.
+            if label not in header:
+                raise ValueError(f"Column {label!r} not found in header row: {header}") from None
+            return header.index(label)
+
+    @staticmethod
+    def _cell(row: list, index: int) -> str:
+        """Return `row[index]`, or an empty string when the row is too short."""
+        try:
+            return row[index]
+        except IndexError:
+            return ""
+
+    def _read_urls(self, file):
+        """Yield the URL cell of every data row in `file`; the file is closed when done."""
+        # newline="" is what the csv module expects so quoted newlines parse correctly.
+        with open(file, "r", newline="") as f:
+            reader = csv.reader(f)
+            first_row = next(reader, None)
+            if first_row is None:
+                logger.debug(f"Skipping empty file: {file}")
+                return
+            url_column = self._column_index(self.column, first_row)
+            first_cell = self._cell(first_row, url_column)
+            if url_or_none(first_cell):
+                # The first row already holds a URL, so it is data rather than a header.
+                yield first_cell
+            else:
+                logger.debug(f"Skipping header row: {first_row}")
+            for row in reader:
+                url = self._cell(row, url_column)
+                if url:  # blank lines and short rows carry no URL
+                    yield url
+
+    def __iter__(self) -> Metadata:
+        """Yield one `Metadata` per URL found in the configured CSV files.
+
+        The first row of each file is skipped only when `url_or_none` rejects
+        its URL cell, i.e. when it is a header row. The "folder" value of
+        `ArchivingContext` is set to "cli" after each file has been read.
+        """
+        processed = 0
+        for file in self.files or []:
+            for url in self._read_urls(file):
+                logger.debug(f"Processing {url}")
+                yield Metadata().set_url(url)
+                processed += 1
+            ArchivingContext.set("folder", "cli")
+
+        logger.success(f"Processed {processed} URL(s)")