From b763fc418838b30433e05877d396c02c7d50a0fb Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sat, 21 Jan 2023 19:44:12 +0000
Subject: [PATCH] final naming cleanup + new feeders/dbs

---
 .gitignore                                    |   3 +-
 src/auto_archiver/__init__.py                 |   2 +-
 src/auto_archiver/__main__.py                 |   4 +-
 src/auto_archiver/archivers/__init__.py       |  18 +-
 src/auto_archiver/archivers/archiver.py       |   6 +-
 src/auto_archiver/archivers/base_archiver.py  | 384 ------------------
 ...am_archiverv2.py => instagram_archiver.py} |   4 +-
 ...ram_archiverv2.py => telegram_archiver.py} |   4 +-
 ...hon_archiverv2.py => telethon_archiver.py} |   4 +-
 ...iktok_archiverv2.py => tiktok_archiver.py} |   4 +-
 ..._archiverv2.py => twitter_api_archiver.py} |   6 +-
 ...tter_archiverv2.py => twitter_archiver.py} |   4 +-
 .../{vk_archiverv2.py => vk_archiver.py}      |   4 +-
 ...dl_archiverv2.py => youtubedl_archiver.py} |   4 +-
 src/auto_archiver/auto_archive.py             | 176 --------
 src/auto_archiver/auto_auto_archive.py        |  55 +--
 src/auto_archiver/core/__init__.py            |   2 +-
 .../core/{v2config.py => config.py}           |  16 +-
 src/auto_archiver/core/metadata.py            |   4 +-
 src/auto_archiver/core/orchestrator.py        |  16 +-
 src/auto_archiver/core/step.py                |   2 +-
 src/auto_archiver/databases/__init__.py       |   3 +-
 src/auto_archiver/databases/csv_db.py         |  33 ++
 src/auto_archiver/databases/database.py       |   1 -
 .../enrichers/wayback_enricher.py             |   4 +-
 src/auto_archiver/feeders/__init__.py         |   3 +-
 src/auto_archiver/feeders/cli_feeder.py       |  33 ++
 src/auto_archiver/storages/__init__.py        |  12 +-
 src/auto_archiver/storages/base_storage.py    |  33 --
 src/auto_archiver/storages/gd_storage.py      | 311 +++++++-------
 src/auto_archiver/storages/local.py           |   4 +-
 src/auto_archiver/storages/s3.py              |   4 +-
 src/auto_archiver/storages/s3_storage.py      |  80 ----
 src/auto_archiver/storages/storage.py         |   6 +-
 34 files changed, 322 insertions(+), 927 deletions(-)
 delete mode 100644 src/auto_archiver/archivers/base_archiver.py
 rename src/auto_archiver/archivers/{instagram_archiverv2.py => instagram_archiver.py} (99%)
 rename src/auto_archiver/archivers/{telegram_archiverv2.py => telegram_archiver.py} (97%)
 rename src/auto_archiver/archivers/{telethon_archiverv2.py => telethon_archiver.py} (99%)
 rename src/auto_archiver/archivers/{tiktok_archiverv2.py => tiktok_archiver.py} (96%)
 rename src/auto_archiver/archivers/{twitter_api_archiverv2.py => twitter_api_archiver.py} (97%)
 rename src/auto_archiver/archivers/{twitter_archiverv2.py => twitter_archiver.py} (98%)
 rename src/auto_archiver/archivers/{vk_archiverv2.py => vk_archiver.py} (97%)
 rename src/auto_archiver/archivers/{youtubedl_archiverv2.py => youtubedl_archiver.py} (97%)
 delete mode 100644 src/auto_archiver/auto_archive.py
 rename src/auto_archiver/core/{v2config.py => config.py} (93%)
 create mode 100644 src/auto_archiver/databases/csv_db.py
 create mode 100644 src/auto_archiver/feeders/cli_feeder.py
 delete mode 100644 src/auto_archiver/storages/base_storage.py
 delete mode 100644 src/auto_archiver/storages/s3_storage.py

diff --git a/.gitignore b/.gitignore
index 2c55563..b3a4b36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@ instaloader/*
 instaloader.session
 orchestration.yaml
 auto_archiver.egg-info*
-logs*
\ No newline at end of file
+logs*
+*.csv
\ No newline at end of file
diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py
index c02d330..adb8551 100644
--- a/src/auto_archiver/__init__.py
+++ b/src/auto_archiver/__init__.py
@@ -2,6 +2,6 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
 
 # need to manually specify due to cyclical deps
 from .core.orchestrator import ArchivingOrchestrator
-from .core.v2config import ConfigV2
+from .core.config import Config
 # making accessible directly
 from .core.metadata import Metadata
\ No newline at end of file
diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 812e4e2..e8a81c4 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -1,8 +1,8 @@
-from . import ConfigV2
+from . import Config
 from . import ArchivingOrchestrator
 
 def main():
-    config = ConfigV2()
+    config = Config()
     config.parse()
     orchestrator = ArchivingOrchestrator(config)
     orchestrator.feed()
diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py
index 595ea8c..b5c4faa 100644
--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -10,12 +10,12 @@
 # from .twitter_api_archiver import TwitterApiArchiver
 # from .instagram_archiver import InstagramArchiver
 
-from .archiver import Archiverv2
-from .telethon_archiverv2 import TelethonArchiver
-from .twitter_archiverv2 import TwitterArchiver
-from .twitter_api_archiverv2 import TwitterApiArchiver
-from .instagram_archiverv2 import InstagramArchiver
-from .tiktok_archiverv2 import TiktokArchiver
-from .telegram_archiverv2 import TelegramArchiver
-from .vk_archiverv2 import VkArchiver
-from .youtubedl_archiverv2 import YoutubeDLArchiver
\ No newline at end of file
+from .archiver import Archiver
+from .telethon_archiver import TelethonArchiver
+from .twitter_archiver import TwitterArchiver
+from .twitter_api_archiver import TwitterApiArchiver
+from .instagram_archiver import InstagramArchiver
+from .tiktok_archiver import TiktokArchiver
+from .telegram_archiver import TelegramArchiver
+from .vk_archiver import VkArchiver
+from .youtubedl_archiver import YoutubeDLArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index d09044e..49efd66 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -8,16 +8,16 @@ from ..core import Step
 
 
 @dataclass
-class Archiverv2(Step):
+class Archiver(Step):
     name = "archiver"
 
     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
         super().__init__(config)
 
-    def init(name: str, config: dict) -> Archiverv2:
+    def init(name: str, config: dict) -> Archiver:
         # only for typing...
-        return Step.init(name, config, Archiverv2)
+        return Step.init(name, config, Archiver)
 
     def setup(self) -> None:
         # used when archivers need to login or do other one-time setup
diff --git a/src/auto_archiver/archivers/base_archiver.py b/src/auto_archiver/archivers/base_archiver.py
deleted file mode 100644
index 103b9ff..0000000
--- a/src/auto_archiver/archivers/base_archiver.py
+++ /dev/null
@@ -1,384 +0,0 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass, field
-from abc import ABC, abstractmethod
-from urllib.parse import urlparse
-from random import randrange
-from collections import defaultdict
-
-import ffmpeg
-from loguru import logger
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.common.by import By
-from slugify import slugify
-
-from ..configs import Config
-from storages import Storage
-from utils import mkdir_if_not_exists
-
-
-@dataclass
-class ArchiveResult:
-    status: str
-    cdn_url: str = None
-    thumbnail: str = None
-    thumbnail_index: str = None
-    duration: float = None
-    title: str = None
-    timestamp: datetime.datetime = None
-    screenshot: str = None
-    wacz: str = None
-    hash: str = None
-    media: list = field(default_factory=list)
-
-class Archiver(ABC):
-    name = "default"
-    retry_regex = r"retrying at (\d+)$"
-
-    def __init__(self, storage: Storage, config: Config):
-        self.storage = storage
-        self.driver = config.webdriver
-        self.hash_algorithm = config.hash_algorithm
-        self.browsertrix = config.browsertrix_config
-        self.is_docker = config.is_docker
-        self.media = []
-
-    def __str__(self):
-        return self.__class__.__name__
-
-    def __repr__(self):
-        return self.__str__()
-
-    @abstractmethod
-    def download(self, url, check_if_exists=False): pass
-
-    def generateArchiveResult(self, **kwargs):
-        # remove duplicates
-        if "cdn_url" in kwargs:
-            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
-        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
-        return ArchiveResult(**kwargs)
-
-    def get_netloc(self, url):
-        return urlparse(url).netloc
-
-    def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
-        media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
-        if key: media_info["key"] = key
-        if hash: media_info["hash"] = hash
-        self.media.append(media_info)
-
-    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
-        """
-        Generates an index.html page where each @urls_info is displayed
-        """
-        for ui in urls_info:
-            self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
-        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
-            <body>
-            <h2>Archived media from {self.name}</h2>
-            <h3><a href="{url}">{url}</a></h3><ul>'''
-
-        for url_info in urls_info:
-            mime_global = self._guess_file_type(url_info["key"])
-            preview = ""
-            if mime_global == "image":
-                preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
-            elif mime_global == "video":
-                preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
-            page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''
-
-        page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
-        page += f"</body></html>"
-
-        page_key = self.get_html_key(url)
-        page_filename = os.path.join(Storage.TMP_FOLDER, page_key)
-
-        with open(page_filename, "w") as f:
-            f.write(page)
-
-        page_hash = self.get_hash(page_filename)
-
-        self.storage.upload(page_filename, page_key, extra_args={
-            'ACL': 'public-read', 'ContentType': 'text/html'})
-
-        page_cdn = self.storage.get_cdn_url(page_key)
-        return (page_cdn, page_hash, thumbnail)
-
-    def _guess_file_type(self, path: str):
-        """
-        Receives a URL or filename and returns global mimetype like 'image' or 'video'
-        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
-        """
-        mime = mimetypes.guess_type(path)[0]
-        if mime is not None:
-            return mime.split("/")[0]
-        return ""
-
-    def download_from_url(self, url, to_filename):
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
-        }
-        d = requests.get(url, headers=headers)
-        with open(to_filename, 'wb') as f:
-            f.write(d.content)
-
-    def generate_media_page(self, urls, url, object):
-        """
-        For a list of media urls, fetch them, upload them
-        and call self.generate_media_page_html with them
-        """
-        for media_url in urls:
-            self.add_to_media(media_url)
-
-        thumbnail = None
-        uploaded_media = []
-        for media_url in urls:
-            key = self._get_key_from_url(media_url, ".jpg")
-
-            filename = os.path.join(Storage.TMP_FOLDER, key)
-            self.download_from_url(media_url, filename)
-            self.storage.upload(filename, key)
-            hash = self.get_hash(filename)
-            cdn_url = self.storage.get_cdn_url(key)
-
-            if thumbnail is None:
-                thumbnail = cdn_url
-            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
-
-        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
-
-    def get_key(self, filename):
-        """
-        returns a key in the format "[archiverName]_[filename]" includes extension
-        """
-        tail = os.path.split(filename)[1]  # returns filename.ext from full path
-        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
-        if 'unknown_video' in _id:
-            _id = _id.replace('unknown_video', 'jpg')
-
-        # long filenames can cause problems, so trim them if necessary
-        if len(_id) > 128:
-            _id = _id[-128:]
-
-        return f'{self.name}_{_id}{extension}'
-
-    def get_html_key(self, url):
-        return self._get_key_from_url(url, ".html")
-
-    def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
-        """
-        Receives a URL and returns a slugified version of the URL path
-        if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
-        if @append_date is true, the key adds a timestamp after the URL slug and before the extension
-        """
-        url_path = urlparse(url).path
-        path, ext = os.path.splitext(url_path)
-        slug = slugify(path)
-        if append_datetime:
-            slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
-        if len(ext):
-            slug += ext
-        if with_extension is not None:
-            if "." not in slug:
-                slug += with_extension
-        return self.get_key(slug)
-
-    def get_hash(self, filename):
-        with open(filename, "rb") as f:
-            bytes = f.read()  # read entire file as bytes
-            logger.debug(f'Hash algorithm is {self.hash_algorithm}')
-
-            if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
-            elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
-            else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
-
-        return hash.hexdigest()
-
-    def get_screenshot(self, url):
-        logger.debug(f"getting screenshot for {url=}")
-        key = self._get_key_from_url(url, ".png", append_datetime=True)
-        filename = os.path.join(Storage.TMP_FOLDER, key)
-
-        # Accept cookies popup dismiss for ytdlp video
-        if 'facebook.com' in url:
-            try:
-                logger.debug(f'Trying fb click accept cookie popup for {url}')
-                self.driver.get("http://www.facebook.com")
-                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
-                foo.click()
-                logger.debug(f'fb click worked')
-                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
-                time.sleep(2)
-            except:
-                logger.warning(f'Failed on fb accept cookies for url {url}')
-
-        try:
-            self.driver.get(url)
-            time.sleep(6)
-        except TimeoutException:
-            logger.info("TimeoutException loading page for screenshot")
-
-        self.driver.save_screenshot(filename)
-        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
-
-        cdn_url = self.storage.get_cdn_url(key)
-        self.add_to_media(cdn_url, key)
-
-        return cdn_url
-
-    def get_wacz(self, url):
-        if not self.browsertrix.enabled:
-            logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
-            return
-        if self.is_docker:
-            # TODO: figure out support for browsertrix in docker
-            # see: https://github.com/bellingcat/auto-archiver/issues/66
-            logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
-            return
-
-        logger.debug(f"getting wacz for {url}")
-        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
-        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
-
-        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
-        cmd = [
-            "docker", "run",
-            "--rm",  # delete container once it has completed running
-            "-v", f"{browsertrix_home}:/crawls/",
-            # "-it", # this leads to "the input device is not a TTY"
-            "webrecorder/browsertrix-crawler", "crawl",
-            "--url", url,
-            "--scopeType", "page",
-            "--generateWACZ",
-            "--text",
-            "--collection", collection,
-            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
-            "--behaviorTimeout", str(self.browsertrix.timeout_seconds),
-            "--timeout", str(self.browsertrix.timeout_seconds)
-        ]
-
-        if not os.path.isdir(browsertrix_home):
-            os.mkdir(browsertrix_home)
-
-        if self.browsertrix.profile:
-            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
-            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
-
-        try:
-            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
-            subprocess.run(cmd, check=True)
-        except Exception as e:
-            logger.error(f"WACZ generation failed: {e}")
-            return
-
-        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
-
-        # do not crash if upload fails
-        try:
-            self.storage.upload(filename, key, extra_args={
-                'ACL': 'public-read', 'ContentType': 'application/zip'})
-        except FileNotFoundError as e:
-            logger.warning(f"Unable to locate and upload WACZ  {filename=}, {key=}")
-
-        # clean up the local browsertrix files
-        try:
-            shutil.rmtree(browsertrix_home)
-        except PermissionError:
-            logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
-
-        cdn_url = self.storage.get_cdn_url(key)
-        self.add_to_media(cdn_url, key)
-        return cdn_url
-
-    def get_thumbnails(self, filename, key, duration=None):
-        thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
-        key_folder = key.split('.')[0] + os.path.sep
-
-        mkdir_if_not_exists(thumbnails_folder)
-
-        fps = 0.5
-        if duration is not None:
-            duration = float(duration)
-
-            if duration < 60:
-                fps = 10.0 / duration
-            elif duration < 120:
-                fps = 20.0 / duration
-            else:
-                fps = 40.0 / duration
-
-        stream = ffmpeg.input(filename)
-        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-        stream.output(thumbnails_folder + 'out%d.jpg').run()
-
-        thumbnails = os.listdir(thumbnails_folder)
-        cdn_urls = []
-        for fname in thumbnails:
-            if fname[-3:] == 'jpg':
-                thumbnail_filename = thumbnails_folder + fname
-                key = os.path.join(key_folder, fname)
-
-                self.storage.upload(thumbnail_filename, key)
-                cdn_url = self.storage.get_cdn_url(key)
-                cdn_urls.append(cdn_url)
-
-        if len(cdn_urls) == 0:
-            return ('', '')
-
-        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
-
-        index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
-            <body>'''
-
-        for t in cdn_urls:
-            index_page += f'<img src="{t}" />'
-
-        index_page += f"</body></html>"
-        index_fname = thumbnails_folder + 'index.html'
-
-        with open(index_fname, 'w') as f:
-            f.write(index_page)
-
-        thumb_index = key_folder + 'index.html'
-
-        self.storage.upload(index_fname, thumb_index, extra_args={
-                            'ACL': 'public-read', 'ContentType': 'text/html'})
-        shutil.rmtree(thumbnails_folder)
-
-        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
-
-        return (key_thumb, thumb_index_cdn_url)
-
-    def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
-        """
-        sets state to retry in random between (min_seconds, max_seconds)
-        """
-        now = datetime.datetime.now().timestamp()
-        retry_at = int(now + randrange(min_seconds, max_seconds))
-        logger.debug(f"signaling {retry_at=}")
-        return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)
-
-    def is_retry(status):
-        return re.search(Archiver.retry_regex, status) is not None
-
-    def should_retry_from_status(status):
-        """
-        checks status against message in signal_retry_in
-        returns true if enough time has elapsed, false otherwise
-        """
-        match = re.search(Archiver.retry_regex, status)
-        if match:
-            retry_at = int(match.group(1))
-            now = datetime.datetime.now().timestamp()
-            should_retry = now >= retry_at
-            logger.debug(f"{should_retry=} since {now=} and {retry_at=}")
-            return should_retry
-        return False
-
-    def remove_retry(status):
-        """
-        transforms the status from retry into something else
-        """
-        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
-        logger.debug(f"removing retry message at {status=}, got {new_status=}")
-        return new_status
diff --git a/src/auto_archiver/archivers/instagram_archiverv2.py b/src/auto_archiver/archivers/instagram_archiver.py
similarity index 99%
rename from src/auto_archiver/archivers/instagram_archiverv2.py
rename to src/auto_archiver/archivers/instagram_archiver.py
index 126622a..c0d1b2c 100644
--- a/src/auto_archiver/archivers/instagram_archiverv2.py
+++ b/src/auto_archiver/archivers/instagram_archiver.py
@@ -2,11 +2,11 @@ import re, os, shutil, html, traceback
 import instaloader  # https://instaloader.github.io/as-module.html
 from loguru import logger
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
-class InstagramArchiver(Archiverv2):
+class InstagramArchiver(Archiver):
     """
     Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
     """
diff --git a/src/auto_archiver/archivers/telegram_archiverv2.py b/src/auto_archiver/archivers/telegram_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/telegram_archiverv2.py
rename to src/auto_archiver/archivers/telegram_archiver.py
index 2f4bf23..b579dc4 100644
--- a/src/auto_archiver/archivers/telegram_archiverv2.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@@ -4,12 +4,12 @@ import html
 from bs4 import BeautifulSoup
 from loguru import logger
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
 
-class TelegramArchiver(Archiverv2):
+class TelegramArchiver(Archiver):
     """
     Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
     """
diff --git a/src/auto_archiver/archivers/telethon_archiverv2.py b/src/auto_archiver/archivers/telethon_archiver.py
similarity index 99%
rename from src/auto_archiver/archivers/telethon_archiverv2.py
rename to src/auto_archiver/archivers/telethon_archiver.py
index 57d27e1..20bf28e 100644
--- a/src/auto_archiver/archivers/telethon_archiverv2.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -8,12 +8,12 @@ from loguru import logger
 from tqdm import tqdm
 import re, time, json, os
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
 
-class TelethonArchiver(Archiverv2):
+class TelethonArchiver(Archiver):
     name = "telethon_archiver"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
diff --git a/src/auto_archiver/archivers/tiktok_archiverv2.py b/src/auto_archiver/archivers/tiktok_archiver.py
similarity index 96%
rename from src/auto_archiver/archivers/tiktok_archiverv2.py
rename to src/auto_archiver/archivers/tiktok_archiver.py
index ea7f670..0c41193 100644
--- a/src/auto_archiver/archivers/tiktok_archiverv2.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -5,12 +5,12 @@ import uuid
 import tiktok_downloader
 from loguru import logger
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
 
-class TiktokArchiver(Archiverv2):
+class TiktokArchiver(Archiver):
     name = "tiktok_archiver"
 
     def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/archivers/twitter_api_archiverv2.py b/src/auto_archiver/archivers/twitter_api_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/twitter_api_archiverv2.py
rename to src/auto_archiver/archivers/twitter_api_archiver.py
index 007193c..821d8d4 100644
--- a/src/auto_archiver/archivers/twitter_api_archiverv2.py
+++ b/src/auto_archiver/archivers/twitter_api_archiver.py
@@ -7,13 +7,13 @@ from loguru import logger
 from pytwitter import Api
 from slugify import slugify
 
-from . import Archiverv2
-from .twitter_archiverv2 import TwitterArchiver
+from . import Archiver
+from .twitter_archiver import TwitterArchiver
 from ..core import Metadata
 from ..core import Media
 
 
-class TwitterApiArchiver(TwitterArchiver, Archiverv2):
+class TwitterApiArchiver(TwitterArchiver, Archiver):
     name = "twitter_api_archiver"
 
     def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/archivers/twitter_archiverv2.py b/src/auto_archiver/archivers/twitter_archiver.py
similarity index 98%
rename from src/auto_archiver/archivers/twitter_archiverv2.py
rename to src/auto_archiver/archivers/twitter_archiver.py
index 58942ab..194025d 100644
--- a/src/auto_archiver/archivers/twitter_archiverv2.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -7,11 +7,11 @@ from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 from slugify import slugify
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
-class TwitterArchiver(Archiverv2):
+class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods.
     """
diff --git a/src/auto_archiver/archivers/vk_archiverv2.py b/src/auto_archiver/archivers/vk_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/vk_archiverv2.py
rename to src/auto_archiver/archivers/vk_archiver.py
index 4c2ad98..1d76282 100644
--- a/src/auto_archiver/archivers/vk_archiverv2.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -2,12 +2,12 @@ from loguru import logger
 from vk_url_scraper import VkScraper
 
 from ..utils.misc import dump_payload
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
 
-class VkArchiver(Archiverv2):
+class VkArchiver(Archiver):
     """"
     VK videos are handled by YTDownloader, this archiver gets posts text and images.
     Currently only works for /wall posts
diff --git a/src/auto_archiver/archivers/youtubedl_archiverv2.py b/src/auto_archiver/archivers/youtubedl_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/youtubedl_archiverv2.py
rename to src/auto_archiver/archivers/youtubedl_archiver.py
index 0b5916c..443fb17 100644
--- a/src/auto_archiver/archivers/youtubedl_archiverv2.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -4,12 +4,12 @@ import os
 import yt_dlp
 from loguru import logger
 
-from . import Archiverv2
+from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
 
-class YoutubeDLArchiver(Archiverv2):
+class YoutubeDLArchiver(Archiver):
     name = "youtubedl_enricher"
 
     def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/auto_archive.py b/src/auto_archiver/auto_archive.py
deleted file mode 100644
index a797405..0000000
--- a/src/auto_archiver/auto_archive.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import os, datetime, traceback, random, tempfile
-
-from loguru import logger
-from slugify import slugify
-from urllib.parse import quote
-
-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver
-from utils import GWorksheet, expand_url
-from configs import Config
-from storages import Storage
-
-random.seed()
-
-
-def update_sheet(gw, row, url, result: ArchiveResult):
-    cell_updates = []
-    row_values = gw.get_row(row)
-
-    def batch_if_valid(col, val, final_value=None):
-        final_value = final_value or val
-        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
-            cell_updates.append((row, col, final_value))
-
-    cell_updates.append((row, 'status', result.status))
-
-    batch_if_valid('archive', result.cdn_url)
-    batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
-    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
-    batch_if_valid('thumbnail_index', result.thumbnail_index)
-    batch_if_valid('title', result.title)
-    batch_if_valid('duration', result.duration, str(result.duration))
-    batch_if_valid('screenshot', result.screenshot)
-    batch_if_valid('hash', result.hash)
-    if result.wacz is not None:
-        batch_if_valid('wacz', result.wacz)
-        batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
-
-    if result.timestamp is not None:
-        if type(result.timestamp) == int:
-            timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
-        elif type(result.timestamp) == str:
-            timestamp_string = result.timestamp
-        else:
-            timestamp_string = result.timestamp.isoformat()
-
-        batch_if_valid('timestamp', timestamp_string)
-
-    gw.batch_set_cell(cell_updates)
-
-
-def missing_required_columns(gw: GWorksheet):
-    missing = False
-    for required_col in ['url', 'status']:
-        if not gw.col_exists(required_col):
-            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
-            missing = True
-    return missing
-
-
-def should_process_sheet(c: Config, sheet_name):
-    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
-        # ALLOW rules exist AND sheet name not explicitly allowed
-        return False
-    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
-        # BLOCK rules exist AND sheet name is blocked
-        return False
-    return True
-
-
-def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
-    url = expand_url(url)
-    c.set_folder(folder)
-    storage = c.get_storage()
-
-    # make a new driver so each spreadsheet row is idempotent
-    c.recreate_webdriver()
-
-    # order matters, first to succeed excludes remaining
-    active_archivers = [
-        TelethonArchiver(storage, c),
-        TiktokArchiver(storage, c),
-        TwitterApiArchiver(storage, c),
-        InstagramArchiver(storage, c),
-        YoutubeDLArchiver(storage, c),
-        TelegramArchiver(storage, c),
-        TwitterArchiver(storage, c),
-        VkArchiver(storage, c),
-        WaybackArchiver(storage, c)
-    ]
-
-    for archiver in active_archivers:
-        logger.debug(f'Trying {archiver} on {debug_string}')
-
-        try:
-            result = archiver.download(url, check_if_exists=c.check_if_exists)
-        except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
-        except Exception as e:
-            result = False
-            logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
-        if result:
-            success = result.status in ['success', 'already archived']
-            result.status = f"{archiver.name}: {result.status}"
-            if success:
-                logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
-                break
-            # only 1 retry possible for now
-            if is_retry and Archiver.is_retry(result.status):
-                result.status = Archiver.remove_retry(result.status)
-            logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
-    return result
-
-
-def process_sheet(c: Config):
-    sh = c.gsheets_client.open(c.sheet)
-
-    # loop through worksheets to check
-    for ii, wks in enumerate(sh.worksheets()):
-        if not should_process_sheet(c, wks.title):
-            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
-            continue
-
-        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
-        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
-
-        if missing_required_columns(gw): continue
-
-        # archives will default to being in a folder 'doc_name/worksheet_name'
-        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
-        c.set_folder(default_folder)
-        storage = c.get_storage()
-
-        # loop through rows in worksheet
-        for row in range(1 + c.header, gw.count_rows() + 1):
-            url = gw.get_cell(row, 'url')
-            original_status = gw.get_cell(row, 'status')
-            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
-
-            is_retry = False
-            if url == '' or status not in ['', None]:
-                is_retry = Archiver.should_retry_from_status(status)
-                if not is_retry: continue
-
-            # All checks done - archival process starts here
-            try:
-                gw.set_cell(row, 'status', 'Archive in progress')
-                result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
-                if result:
-                    update_sheet(gw, row, url, result)
-                else:
-                    gw.set_cell(row, 'status', 'failed: no archiver')
-            except KeyboardInterrupt:
-                # catches keyboard interruptions to do a clean exit
-                logger.warning(f"caught interrupt on {row=}, {url=}")
-                gw.set_cell(row, 'status', '')
-                c.destroy_webdriver()
-                exit()
-            except Exception as e:
-                logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
-                gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
-        logger.success(f'Finished worksheet {wks.title}')
-
-
-@logger.catch
-def main():
-    c = Config()
-    c.parse()
-    logger.info(f'Opening document {c.sheet} for header {c.header}')
-    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
-        Storage.TMP_FOLDER = tmpdir
-        process_sheet(c)
-        c.destroy_webdriver()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/auto_archiver/auto_auto_archive.py b/src/auto_archiver/auto_auto_archive.py
index 8c794c5..660be4f 100644
--- a/src/auto_archiver/auto_auto_archive.py
+++ b/src/auto_archiver/auto_auto_archive.py
@@ -1,29 +1,34 @@
-import tempfile
-import auto_archive
-from loguru import logger
-from configs import Config
-from storages import Storage
+
+#TODO: refactor GDriveStorage before merging to main
+# is it possible to have something like this with the new pipeline?
 
 
-def main():
-    c = Config()
-    c.parse()
-    logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
-
-    gc = c.gsheets_client
-    sh = gc.open(c.sheet)
-
-    wks = sh.get_worksheet(0)
-    values = wks.get_all_values()
-
-    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
-        Storage.TMP_FOLDER = tmpdir
-        for i in range(11, len(values)):
-            c.sheet = values[i][0]
-            logger.info(f"Processing {c.sheet}")
-            auto_archive.process_sheet(c)
-        c.destroy_webdriver()
+# # import tempfile
+# import auto_archive
+# from loguru import logger
+# from configs import Config
+# from storages import Storage
 
 
-if __name__ == "__main__":
-    main()
+# def main():
+#     c = Config()
+#     c.parse()
+#     logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
+
+#     gc = c.gsheets_client
+#     sh = gc.open(c.sheet)
+
+#     wks = sh.get_worksheet(0)
+#     values = wks.get_all_values()
+
+#     with tempfile.TemporaryDirectory(dir="./") as tmpdir:
+#         Storage.TMP_FOLDER = tmpdir
+#         for i in range(11, len(values)):
+#             c.sheet = values[i][0]
+#             logger.info(f"Processing {c.sheet}")
+#             auto_archive.process_sheet(c)
+#         c.destroy_webdriver()
+
+
+# if __name__ == "__main__":
+#     main()
diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py
index a171478..04f381e 100644
--- a/src/auto_archiver/core/__init__.py
+++ b/src/auto_archiver/core/__init__.py
@@ -4,4 +4,4 @@ from .step import Step
 
 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
-# from .v2config import ConfigV2
\ No newline at end of file
+# from .config import Config
\ No newline at end of file
diff --git a/src/auto_archiver/core/v2config.py b/src/auto_archiver/core/config.py
similarity index 93%
rename from src/auto_archiver/core/v2config.py
rename to src/auto_archiver/core/config.py
index 3db6618..73bd585 100644
--- a/src/auto_archiver/core/v2config.py
+++ b/src/auto_archiver/core/config.py
@@ -5,31 +5,31 @@ from dataclasses import dataclass, field
 from typing import List
 from collections import defaultdict
 
-from ..archivers import Archiverv2
+from ..archivers import Archiver
 from ..feeders import Feeder
 from ..databases import Database
 from ..formatters import Formatter
-from ..storages import StorageV2
+from ..storages import Storage
 from . import Step
 from ..enrichers import Enricher
 
 
 @dataclass
-class ConfigV2:
+class Config:
     # TODO: should Config inherit from Step so it can have it's own configurations?
     # these are only detected if they are put to the respective __init__.py
     configurable_parents = [
         Feeder,
         Enricher,
-        Archiverv2,
+        Archiver,
         Database,
-        StorageV2,
+        Storage,
         Formatter
         # Util
     ]
     feeder: Step  # TODO:= BaseFeeder
     formatter: Formatter
-    archivers: List[Archiverv2] = field(default_factory=[])  # TODO: fix type
+    archivers: List[Archiver] = field(default_factory=[])  # TODO: fix type
     enrichers: List[Enricher] = field(default_factory=[])
     storages: List[Step] = field(default_factory=[])  # TODO: fix type
     databases: List[Database] = field(default_factory=[])
@@ -107,9 +107,9 @@ class ConfigV2:
         self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
         self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
         self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
-        self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])]
+        self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
         self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
-        self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]
+        self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
 
         print("feeder", self.feeder)
         print("enrichers", [e for e in self.enrichers])
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index bb3079c..f09d7e0 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -4,7 +4,7 @@ from ast import List, Set
 from typing import Any, Union, Dict
 from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json
-import datetime, mimetypes
+import datetime
 from urllib.parse import urlparse
 from loguru import logger
 from dateutil.parser import parse as parse_dt
@@ -17,7 +17,7 @@ class Metadata:
     status: str = "no archiver"
     _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
     metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False)  # keys that are not to be saved in DBs
+    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True})  # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None  # can be overwritten by formatters
     rearchivable: bool = False
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 19a35aa..bb7902e 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -3,10 +3,10 @@ from ast import List
 from typing import Union, Dict
 from dataclasses import dataclass
 
-from ..archivers import Archiverv2
+from ..archivers import Archiver
 from ..feeders import Feeder
 from ..formatters import Formatter
-from ..storages import StorageV2
+from ..storages import Storage
 from ..enrichers import Enricher
 from ..databases import Database
 from .media import Media
@@ -59,9 +59,9 @@ class ArchivingOrchestrator:
         self.feeder: Feeder = config.feeder
         self.formatter: Formatter = config.formatter
         self.enrichers = config.enrichers
-        self.archivers: List[Archiverv2] = config.archivers
+        self.archivers: List[Archiver] = config.archivers
         self.databases: List[Database] = config.databases
-        self.storages: List[StorageV2] = config.storages
+        self.storages: List[Storage] = config.storages
 
         for a in self.archivers: a.setup()
 
@@ -69,12 +69,12 @@ class ArchivingOrchestrator:
         for item in self.feeder:
             self.feed_item(item)
 
-    def feed_item(self, item:Metadata) -> Metadata:
+    def feed_item(self, item: Metadata) -> Metadata:
         print("ARCHIVING", item)
         try:
             with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                 item.set_tmp_dir(tmp_dir)
-                result = self.archive(item)
+                return self.archive(item)
         except KeyboardInterrupt:
             # catches keyboard interruptions to do a clean exit
             logger.warning(f"caught interrupt on {item=}")
@@ -84,8 +84,6 @@ class ArchivingOrchestrator:
             logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
             for d in self.databases: d.failed(item)
 
-        return result
-
         # how does this handle the parameters like folder which can be different for each archiver?
         # the storage needs to know where to archive!!
         # solution: feeders have context: extra metadata that they can read or ignore,
@@ -154,7 +152,7 @@ class ArchivingOrchestrator:
                 for prop in m.properties.values():
                     if isinstance(prop, Media):
                         s.store(prop, result)
-                    if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media):
+                    if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
                         for prop_media in prop:
                             s.store(prop_media, result)
 
diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py
index ae10869..4c04c94 100644
--- a/src/auto_archiver/core/step.py
+++ b/src/auto_archiver/core/step.py
@@ -28,7 +28,7 @@ class Step(ABC):
         for sub in child.__subclasses__():
             if sub.name == name:
                 return sub(config)
-        raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.")
+        raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")
 
     def assert_valid_string(self, prop: str) -> None:
         """
diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py
index 3a3e907..1e676ea 100644
--- a/src/auto_archiver/databases/__init__.py
+++ b/src/auto_archiver/databases/__init__.py
@@ -1,3 +1,4 @@
 from .database import Database
 from .gsheet_db import GsheetsDb
-from .console_db import ConsoleDb
\ No newline at end of file
+from .console_db import ConsoleDb
+from .csv_db import CSVDb
\ No newline at end of file
diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/databases/csv_db.py
new file mode 100644
index 0000000..72c804b
--- /dev/null
+++ b/src/auto_archiver/databases/csv_db.py
@@ -0,0 +1,33 @@
+import os
+from loguru import logger
+
+from . import Database
+from ..core import Metadata
+from csv import DictWriter
+from dataclasses import asdict
+
+class CSVDb(Database):
+    """
+        Outputs results to a CSV file
+    """
+    name = "csv_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        self.assert_valid_string("csv_file")
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+        }
+
+    def done(self, item: Metadata) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item}")
+        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
+        with open(self.csv_file, "a", encoding="utf-8") as outf:
+            writer = DictWriter(outf, fieldnames=asdict(Metadata()))
+            if is_empty: writer.writeheader()
+            writer.writerow(asdict(item))
diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/databases/database.py
index 3f09c4a..133afaa 100644
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -18,7 +18,6 @@ class Database(Step, ABC):
         # only for typing...
         return Step.init(name, config, Database)
 
-    @abstractmethod
     def started(self, item: Metadata) -> None:
         """signals the DB that the given item archival has started"""
         pass
diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py
index 945b148..187c3fe 100644
--- a/src/auto_archiver/enrichers/wayback_enricher.py
+++ b/src/auto_archiver/enrichers/wayback_enricher.py
@@ -2,10 +2,10 @@ from loguru import logger
 import time, requests
 
 from . import Enricher
-from ..archivers import Archiverv2
+from ..archivers import Archiver
 from ..core import Metadata
 
-class WaybackArchiverEnricher(Enricher, Archiverv2):
+class WaybackArchiverEnricher(Enricher, Archiver):
     """
     Submits the current URL to the webarchive and returns a job_id or completed archive
     """
diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py
index b11cd50..f42f98f 100644
--- a/src/auto_archiver/feeders/__init__.py
+++ b/src/auto_archiver/feeders/__init__.py
@@ -1,2 +1,3 @@
 from.feeder import Feeder
-from .gsheet_feeder import GsheetsFeeder
\ No newline at end of file
+from .gsheet_feeder import GsheetsFeeder
+from .cli_feeder import CLIFeeder
\ No newline at end of file
diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py
new file mode 100644
index 0000000..6d52085
--- /dev/null
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -0,0 +1,33 @@
+import gspread, os
+
+# from metadata import Metadata
+from loguru import logger
+
+# from . import Enricher
+from . import Feeder
+from ..core import Metadata
+
+
+class CLIFeeder(Feeder):
+    name = "cli_feeder"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        assert type(self.urls) == list and len(self.urls) > 0, "Please provide a CSV list of URL(s) to process, with --cli_feeder.urls='url1,url2,url3'"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "urls": {
+                "default": None,
+                "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
+                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+            },
+        }
+
+    def __iter__(self) -> Metadata:
+        for url in self.urls:
+            logger.debug(f"Processing {url}")
+            yield Metadata().set_url(url).set("folder", "cli", True)
+        logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py
index 7094d25..250004a 100644
--- a/src/auto_archiver/storages/__init__.py
+++ b/src/auto_archiver/storages/__init__.py
@@ -1,9 +1,3 @@
-# we need to explicitly expose the available imports here
-from .base_storage import Storage
-# from .local_storage import LocalStorage, LocalConfig
-# from .s3_storage import S3Config, S3Storage
-# from .gd_storage import GDConfig, GDStorage
-
-from .storage import StorageV2
-from .s3 import S3StorageV2
-from .local import LocalStorageV2
\ No newline at end of file
+from .storage import Storage
+from .s3 import S3Storage
+from .local import LocalStorage
\ No newline at end of file
diff --git a/src/auto_archiver/storages/base_storage.py b/src/auto_archiver/storages/base_storage.py
deleted file mode 100644
index f147678..0000000
--- a/src/auto_archiver/storages/base_storage.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import os, uuid
-from loguru import logger
-from abc import ABC, abstractmethod
-from pathlib import Path
-
-
-class Storage(ABC):
-    TMP_FOLDER = "tmp/"
-
-    @abstractmethod
-    def __init__(self, config): pass
-
-    @abstractmethod
-    def get_cdn_url(self, key): pass
-
-    @abstractmethod
-    def exists(self, key): pass
-
-    @abstractmethod
-    def uploadf(self, file, key, **kwargs): pass
-
-    def clean_key(self, key):
-        # Some storages does not work well with trailing forward slashes and some keys come with that
-        if key.startswith('/'):
-            logger.debug(f'Found and fixed a leading "/" for {key=}')
-            return key[1:]
-        return key
-
-
-    def upload(self, filename: str, key: str, **kwargs):
-        logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
-        with open(filename, 'rb') as f:
-            self.uploadf(f, key, **kwargs)
diff --git a/src/auto_archiver/storages/gd_storage.py b/src/auto_archiver/storages/gd_storage.py
index 3af77f1..09c8938 100644
--- a/src/auto_archiver/storages/gd_storage.py
+++ b/src/auto_archiver/storages/gd_storage.py
@@ -1,178 +1,181 @@
-import os, time
 
-from loguru import logger
-from .base_storage import Storage
-from dataclasses import dataclass
-from googleapiclient.discovery import build
-from googleapiclient.http import MediaFileUpload
-from google.oauth2 import service_account
+#TODO: refactor GDriveStorage before merging to main
+
+# import os, time
+
+# from loguru import logger
+# from .base_storage import Storage
+# from dataclasses import dataclass
+# from googleapiclient.discovery import build
+# from googleapiclient.http import MediaFileUpload
+# from google.oauth2 import service_account
 
 
-from google.oauth2.credentials import Credentials
-from google.auth.transport.requests import Request
+# from google.oauth2.credentials import Credentials
+# from google.auth.transport.requests import Request
 
-@dataclass
-class GDConfig:
-    root_folder_id: str
-    oauth_token_filename: str
-    service_account: str = "service_account.json"
-    folder: str = "default"
+# @dataclass
+# class GDConfig:
+#     root_folder_id: str
+#     oauth_token_filename: str
+#     service_account: str = "service_account.json"
+#     folder: str = "default"
 
-class GDStorage(Storage):
-    def __init__(self, config: GDConfig):
-        self.folder = config.folder
-        self.root_folder_id = config.root_folder_id
+# class GDStorage(Storage):
+#     def __init__(self, config: GDConfig):
+#         self.folder = config.folder
+#         self.root_folder_id = config.root_folder_id
         
-        SCOPES=['https://www.googleapis.com/auth/drive']
+#         SCOPES=['https://www.googleapis.com/auth/drive']
         
-        token_file = config.oauth_token_filename
-        if token_file is not None:
-            """
-            Tokens are refreshed after 1 hour 
-            however keep working for 7 days (tbc)
-            so as long as the job doesn't last for 7 days
-            then this method of refreshing only once per run will work
-            see this link for details on the token
-            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
-            """
-            logger.debug(f'Using GD OAuth token {token_file}')
-            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+#         token_file = config.oauth_token_filename
+#         if token_file is not None:
+#             """
+#             Tokens are refreshed after 1 hour 
+#             however keep working for 7 days (tbc)
+#             so as long as the job doesn't last for 7 days
+#             then this method of refreshing only once per run will work
+#             see this link for details on the token
+#             https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+#             """
+#             logger.debug(f'Using GD OAuth token {token_file}')
+#             creds = Credentials.from_authorized_user_file(token_file, SCOPES)
 
-            if not creds or not creds.valid:
-                if creds and creds.expired and creds.refresh_token:
-                    logger.debug('Requesting new GD OAuth token')
-                    creds.refresh(Request())
-                else:
-                    raise Exception("Problem with creds - create the token again")
+#             if not creds or not creds.valid:
+#                 if creds and creds.expired and creds.refresh_token:
+#                     logger.debug('Requesting new GD OAuth token')
+#                     creds.refresh(Request())
+#                 else:
+#                     raise Exception("Problem with creds - create the token again")
 
-                # Save the credentials for the next run
-                with open(token_file, 'w') as token:
-                    logger.debug('Saving new GD OAuth token')
-                    token.write(creds.to_json())
-            else:
-                logger.debug('GD OAuth Token valid')
-        else:
-            gd_service_account = config.service_account
-            logger.debug(f'Using GD Service Account {gd_service_account}')
-            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+#                 # Save the credentials for the next run
+#                 with open(token_file, 'w') as token:
+#                     logger.debug('Saving new GD OAuth token')
+#                     token.write(creds.to_json())
+#             else:
+#                 logger.debug('GD OAuth Token valid')
+#         else:
+#             gd_service_account = config.service_account
+#             logger.debug(f'Using GD Service Account {gd_service_account}')
+#             creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
 
-        self.service = build('drive', 'v3', credentials=creds)
+#         self.service = build('drive', 'v3', credentials=creds)
 
-    def get_cdn_url(self, key):
-        """
-        only support files saved in a folder for GD
-        S3 supports folder and all stored in the root
-        """
-        key = self.clean_key(key)
+#     def get_cdn_url(self, key):
+#         """
+#         only support files saved in a folder for GD
+#         S3 supports folder and all stored in the root
+#         """
+#         key = self.clean_key(key)
 
-        full_name = os.path.join(self.folder, key)
-        parent_id, folder_id = self.root_folder_id, None
-        path_parts = full_name.split(os.path.sep)
-        filename = path_parts[-1]
-        logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
-        for folder in path_parts[0:-1]:
-            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
-            parent_id = folder_id
+#         full_name = os.path.join(self.folder, key)
+#         parent_id, folder_id = self.root_folder_id, None
+#         path_parts = full_name.split(os.path.sep)
+#         filename = path_parts[-1]
+#         logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
+#         for folder in path_parts[0:-1]:
+#             folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
+#             parent_id = folder_id
 
-        # get id of file inside folder (or sub folder)
-        file_id = self._get_id_from_parent_and_name(folder_id, filename)
-        return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
+#         # get id of file inside folder (or sub folder)
+#         file_id = self._get_id_from_parent_and_name(folder_id, filename)
+#         return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
 
-    def exists(self, key):
-        try:
-            self.get_cdn_url(key)
-            return True
-        except: return False
+#     def exists(self, key):
+#         try:
+#             self.get_cdn_url(key)
+#             return True
+#         except: return False
 
-    def uploadf(self, file: str, key: str, **_kwargs):
-        """
-        1. for each sub-folder in the path check if exists or create
-        2. upload file to root_id/other_paths.../filename
-        """
-        key = self.clean_key(key)
+#     def uploadf(self, file: str, key: str, **_kwargs):
+#         """
+#         1. for each sub-folder in the path check if exists or create
+#         2. upload file to root_id/other_paths.../filename
+#         """
+#         key = self.clean_key(key)
 
-        full_name = os.path.join(self.folder, key)
-        parent_id, upload_to = self.root_folder_id, None
-        path_parts = full_name.split(os.path.sep)
-        filename = path_parts[-1]
-        logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
-        for folder in path_parts[0:-1]:
-            upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
-            if upload_to is None:
-                upload_to = self._mkdir(folder, parent_id)
-            parent_id = upload_to
+#         full_name = os.path.join(self.folder, key)
+#         parent_id, upload_to = self.root_folder_id, None
+#         path_parts = full_name.split(os.path.sep)
+#         filename = path_parts[-1]
+#         logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
+#         for folder in path_parts[0:-1]:
+#             upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+#             if upload_to is None:
+#                 upload_to = self._mkdir(folder, parent_id)
+#             parent_id = upload_to
 
-        # upload file to gd
-        logger.debug(f'uploading {filename=} to folder id {upload_to}')
-        file_metadata = {
-            'name': [filename],
-            'parents': [upload_to]
-        }
-        media = MediaFileUpload(file, resumable=True)
-        gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
-        logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
+#         # upload file to gd
+#         logger.debug(f'uploading {filename=} to folder id {upload_to}')
+#         file_metadata = {
+#             'name': [filename],
+#             'parents': [upload_to]
+#         }
+#         media = MediaFileUpload(file, resumable=True)
+#         gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+#         logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
 
-    def upload(self, filename: str, key: str, **kwargs):
-        # GD only requires the filename not a file reader
-        self.uploadf(filename, key, **kwargs)
+#     def upload(self, filename: str, key: str, **kwargs):
+#         # GD only requires the filename not a file reader
+#         self.uploadf(filename, key, **kwargs)
 
-    # gets the Drive folderID if it is there
-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
-        """
-        Retrieves the id of a folder or file from its @name and the @parent_id folder
-        Optionally does multiple @retries and sleeps @sleep_seconds between them
-        If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
-        If @raise_on_missing will throw error when not found, or returns None
-        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
-        Returns the id of the file or folder from its name as a string
-        """
-        # cache logic
-        if use_cache:
-            self.api_cache = getattr(self, "api_cache", {})
-            cache_key = f"{parent_id}_{name}_{use_mime_type}"
-            if cache_key in self.api_cache:
-                logger.debug(f"cache hit for {cache_key=}")
-                return self.api_cache[cache_key]
+#     # gets the Drive folderID if it is there
+#     def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
+#         """
+#         Retrieves the id of a folder or file from its @name and the @parent_id folder
+#         Optionally does multiple @retries and sleeps @sleep_seconds between them
+#         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
+#         If @raise_on_missing will throw error when not found, or returns None
+#         Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
+#         Returns the id of the file or folder from its name as a string
+#         """
+#         # cache logic
+#         if use_cache:
+#             self.api_cache = getattr(self, "api_cache", {})
+#             cache_key = f"{parent_id}_{name}_{use_mime_type}"
+#             if cache_key in self.api_cache:
+#                 logger.debug(f"cache hit for {cache_key=}")
+#                 return self.api_cache[cache_key]
 
-        # API logic
-        debug_header: str = f"[searching {name=} in {parent_id=}]"
-        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
-        if use_mime_type:
-            query_string += f" and mimeType='application/vnd.google-apps.folder' "
+#         # API logic
+#         debug_header: str = f"[searching {name=} in {parent_id=}]"
+#         query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
+#         if use_mime_type:
+#             query_string += f" and mimeType='application/vnd.google-apps.folder' "
 
-        for attempt in range(retries):
-            results = self.service.files().list(
-                q=query_string,
-                spaces='drive',  # ie not appDataFolder or photos
-                fields='files(id, name)'
-            ).execute()
-            items = results.get('files', [])
+#         for attempt in range(retries):
+#             results = self.service.files().list(
+#                 q=query_string,
+#                 spaces='drive',  # ie not appDataFolder or photos
+#                 fields='files(id, name)'
+#             ).execute()
+#             items = results.get('files', [])
 
-            if len(items) > 0:
-                logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
-                _id = items[-1]['id']
-                if use_cache: self.api_cache[cache_key] = _id
-                return _id
-            else:
-                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
-                if attempt < retries - 1:
-                    logger.debug(f'sleeping for {sleep_seconds} second(s)')
-                    time.sleep(sleep_seconds)
+#             if len(items) > 0:
+#                 logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
+#                 _id = items[-1]['id']
+#                 if use_cache: self.api_cache[cache_key] = _id
+#                 return _id
+#             else:
+#                 logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
+#                 if attempt < retries - 1:
+#                     logger.debug(f'sleeping for {sleep_seconds} second(s)')
+#                     time.sleep(sleep_seconds)
 
-        if raise_on_missing:
-            raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
-        return None
+#         if raise_on_missing:
+#             raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
+#         return None
 
-    def _mkdir(self, name: str, parent_id: str):
-        """
-        Creates a new GDrive folder @name inside folder @parent_id
-        Returns id of the created folder
-        """
-        logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
-        file_metadata = {
-            'name': [name],
-            'mimeType': 'application/vnd.google-apps.folder',
-            'parents': [parent_id]
-        }
-        gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
-        return gd_folder.get('id')
+#     def _mkdir(self, name: str, parent_id: str):
+#         """
+#         Creates a new GDrive folder @name inside folder @parent_id
+#         Returns id of the created folder
+#         """
+#         logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
+#         file_metadata = {
+#             'name': [name],
+#             'mimeType': 'application/vnd.google-apps.folder',
+#             'parents': [parent_id]
+#         }
+#         gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
+#         return gd_folder.get('id')
diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py
index 1d44e6f..f5768a9 100644
--- a/src/auto_archiver/storages/local.py
+++ b/src/auto_archiver/storages/local.py
@@ -8,10 +8,10 @@ from slugify import slugify
 
 from ..core import Metadata
 from ..core import Media
-from ..storages import StorageV2
+from ..storages import Storage
 
 
-class LocalStorageV2(StorageV2):
+class LocalStorage(Storage):
     name = "local_storage"
 
     def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py
index 25eb324..a88e2b2 100644
--- a/src/auto_archiver/storages/s3.py
+++ b/src/auto_archiver/storages/s3.py
@@ -4,12 +4,12 @@ import boto3, uuid, os, mimetypes
 from botocore.errorfactory import ClientError
 from ..core import Metadata
 from ..core import Media
-from ..storages import StorageV2
+from ..storages import Storage
 from loguru import logger
 from slugify import slugify
 
 
-class S3StorageV2(StorageV2):
+class S3Storage(Storage):
     name = "s3_storage"
 
     def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/storages/s3_storage.py b/src/auto_archiver/storages/s3_storage.py
deleted file mode 100644
index 563d2ea..0000000
--- a/src/auto_archiver/storages/s3_storage.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import uuid, os, mimetypes
-from dataclasses import dataclass
-
-import boto3
-from botocore.errorfactory import ClientError
-
-from .base_storage import Storage
-from dataclasses import dataclass
-from loguru import logger
-
-
-@dataclass
-class S3Config:
-    bucket: str
-    region: str
-    key: str
-    secret: str
-    folder: str = ""
-    endpoint_url: str = "https://{region}.digitaloceanspaces.com"
-    cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
-    private: bool = False
-    key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid
-
-
-class S3Storage(Storage):
-
-    def __init__(self, config: S3Config):
-        self.bucket = config.bucket
-        self.region = config.region
-        self.folder = config.folder
-        self.private = config.private
-        self.cdn_url = config.cdn_url
-        self.key_path = config.key_path
-        self.key_dict = {}
-
-        self.s3 = boto3.client(
-            's3',
-            region_name=config.region,
-            endpoint_url=config.endpoint_url.format(region=config.region),
-            aws_access_key_id=config.key,
-            aws_secret_access_key=config.secret
-        )
-
-    def _get_path(self, key):
-        """
-        Depends on the self.key_path configuration:
-        * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
-        * default -> defaults to self.folder/key
-        """
-        # defaults to /key
-        final_key = key
-        if self.key_path == "random":
-            if key not in self.key_dict:
-                ext = os.path.splitext(key)[1]
-                self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
-            final_key = self.key_dict[key]
-        return os.path.join(self.folder, final_key)
-
-    def get_cdn_url(self, key):
-        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
-
-    def exists(self, key):
-        try:
-            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
-            return True
-        except ClientError:
-            return False
-
-    def uploadf(self, file, key, **kwargs):
-        extra_args = kwargs.get("extra_args", {})
-        if not self.private and 'ACL' not in extra_args:
-            extra_args['ACL'] = 'public-read'
-
-        if 'ContentType' not in extra_args:
-            try:
-                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
-            except Exception as e:
-                logger.error(f"Unable to get mimetype for {key=}, error: {e}")
-
-        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index 9d6a005..3e4134a 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -10,7 +10,7 @@ from slugify import slugify
 
 
 @dataclass
-class StorageV2(Step):
+class Storage(Step):
     name = "storage"
 
     def __init__(self, config: dict) -> None:
@@ -18,8 +18,8 @@ class StorageV2(Step):
         super().__init__(config)
 
     # only for typing...
-    def init(name: str, config: dict) -> StorageV2:
-        return Step.init(name, config, StorageV2)
+    def init(name: str, config: dict) -> Storage:
+        return Step.init(name, config, Storage)
 
     def store(self, media: Media, item: Metadata) -> None:
         self.set_key(media, item)