Merge pull request #185 from bellingcat/load_modules

Refactor auto-archiver to use a modular structure for feeders/extractors/enrichers etc.
2026-06-07 19:08:30 +03:00 · 2025-02-11 19:21:46 +01:00
parent e8138eac1c 3787577a96
commit e43dda2817
187 changed files with 5678 additions and 2092 deletions
--- a/.pylintrc
+++ b/.pylintrc
@@ -0,0 +1,3 @@
+[MAIN]
+
+ignore-patterns=.*tests.*\.py,__manifest__\.py
--- a/README.md
+++ b/README.md
@@ -218,7 +218,7 @@ configurations:
 ## Running on Google Sheets Feeder (gsheet_feeder)
 The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. 
 This sheet must have been shared with the Google Service account used by `gspread`. 
-This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is:
+This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purposes are:

 Inputs:

--- a/poetry.lock
+++ b/poetry.lock
@@ -64,14 +64,14 @@ typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}

 [[package]]
 name = "attrs"
-version = "24.3.0"
+version = "25.1.0"
 description = "Classes Without Boilerplate"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"},
-    {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"},
+    {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"},
+    {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"},
 ]

 [package.extras]
@@ -152,18 +152,18 @@ lxml = ["lxml"]

 [[package]]
 name = "boto3"
-version = "1.36.3"
+version = "1.36.6"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"},
-    {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"},
+    {file = "boto3-1.36.6-py3-none-any.whl", hash = "sha256:6d473f0f340d02b4e9ad5b8e68786a09728101a8b950231b89ebdaf72b6dca21"},
+    {file = "boto3-1.36.6.tar.gz", hash = "sha256:b36feae061dc0793cf311468956a0a9e99215ce38bc99a1a4e55a5b105f16297"},
 ]

 [package.dependencies]
-botocore = ">=1.36.3,<1.37.0"
+botocore = ">=1.36.6,<1.37.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.11.0,<0.12.0"

@@ -172,14 +172,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]

 [[package]]
 name = "botocore"
-version = "1.36.3"
+version = "1.36.6"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"},
-    {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"},
+    {file = "botocore-1.36.6-py3-none-any.whl", hash = "sha256:f77bbbb03fb420e260174650fb5c0cc142ec20a96967734eed2b0ef24334ef34"},
+    {file = "botocore-1.36.6.tar.gz", hash = "sha256:4864c53d638da191a34daf3ede3ff1371a3719d952cc0c6bd24ce2836a38dd77"},
 ]

 [package.dependencies]
@@ -798,14 +798,14 @@ uritemplate = ">=3.0.1,<5"

 [[package]]
 name = "google-auth"
-version = "2.37.0"
+version = "2.38.0"
 description = "Google Authentication Library"
 optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
-    {file = "google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0"},
-    {file = "google_auth-2.37.0.tar.gz", hash = "sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00"},
+    {file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"},
+    {file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"},
 ]

 [package.dependencies]
@@ -958,13 +958,14 @@ files = [

 [[package]]
 name = "instaloader"
-version = "4.14"
+version = "4.14.1"
 description = "Download pictures (or videos) along with their captions and other metadata from Instagram."
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "instaloader-4.14.tar.gz", hash = "sha256:754425eb17af44ce4bb6056e4eacd044a518d13b5efc11b9d80eb229bb96c652"},
+    {file = "instaloader-4.14.1-py3-none-any.whl", hash = "sha256:43356f696231621ea5a93354f9a4578124fe131940ee9aa1e83c20f57e18f26d"},
+    {file = "instaloader-4.14.1.tar.gz", hash = "sha256:a41a7372a18fb096b3ed545469479884de9cf768e12020c0e0e67c488d9d599c"},
 ]

 [package.dependencies]
@@ -1024,7 +1025,7 @@ version = "0.7.3"
 description = "Python logging made (stupidly) simple"
 optional = false
 python-versions = "<4.0,>=3.5"
-groups = ["main"]
+groups = ["main", "dev"]
 files = [
    {file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"},
    {file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"},
@@ -1043,7 +1044,7 @@ version = "3.0.0"
 description = "Python port of markdown-it. Markdown parsing, done right!"
 optional = false
 python-versions = ">=3.8"
-groups = ["docs"]
+groups = ["main", "docs"]
 files = [
    {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
    {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
@@ -1135,14 +1136,14 @@ files = [

 [[package]]
 name = "marshmallow"
-version = "3.25.1"
+version = "3.26.0"
 description = "A lightweight library for converting complex datatypes to and from native Python datatypes."
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "marshmallow-3.25.1-py3-none-any.whl", hash = "sha256:ec5d00d873ce473b7f2ffcb7104286a376c354cab0c2fa12f5573dab03e87210"},
-    {file = "marshmallow-3.25.1.tar.gz", hash = "sha256:f4debda3bb11153d81ac34b0d582bf23053055ee11e791b54b4b35493468040a"},
+    {file = "marshmallow-3.26.0-py3-none-any.whl", hash = "sha256:1287bca04e6a5f4094822ac153c03da5e214a0a60bcd557b140f3e66991b8ca1"},
+    {file = "marshmallow-3.26.0.tar.gz", hash = "sha256:eb36762a1cc76d7abf831e18a3a1b26d3d481bbc74581b8e532a3d3a8115e1cb"},
 ]

 [package.dependencies]
@@ -1179,7 +1180,7 @@ version = "0.1.2"
 description = "Markdown URL utilities"
 optional = false
 python-versions = ">=3.7"
-groups = ["docs"]
+groups = ["main", "docs"]
 files = [
    {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
@@ -1658,7 +1659,7 @@ version = "2.19.1"
 description = "Pygments is a syntax highlighting package written in Python."
 optional = false
 python-versions = ">=3.8"
-groups = ["docs"]
+groups = ["main", "docs"]
 files = [
    {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
    {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
@@ -1749,6 +1750,24 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
 [package.extras]
 dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

+[[package]]
+name = "pytest-loguru"
+version = "0.4.0"
+description = "Pytest Loguru"
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "pytest_loguru-0.4.0-py3-none-any.whl", hash = "sha256:3cc7b9c6b22cb158209ccbabf0d678dacd3f3c7497d6f46f1c338c13bee1ac77"},
+    {file = "pytest_loguru-0.4.0.tar.gz", hash = "sha256:0d9e4e72ae9bfd92f774c666e7353766af11b0b78edd59c290e89be116050f03"},
+]
+
+[package.dependencies]
+loguru = "*"
+
+[package.extras]
+test = ["pytest", "pytest-cov"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -1817,7 +1836,7 @@ version = "6.0.2"
 description = "YAML parser and emitter for Python"
 optional = false
 python-versions = ">=3.8"
-groups = ["main", "docs"]
+groups = ["docs"]
 files = [
    {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
    {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@@ -2035,6 +2054,41 @@ files = [
 [package.dependencies]
 six = ">=1.7.0"

+[[package]]
+name = "rich"
+version = "13.9.4"
+description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
+optional = false
+python-versions = ">=3.8.0"
+groups = ["main"]
+files = [
+    {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"},
+    {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"},
+]
+
+[package.dependencies]
+markdown-it-py = ">=2.2.0"
+pygments = ">=2.13.0,<3.0.0"
+typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+jupyter = ["ipywidgets (>=7.5.1,<9)"]
+
+[[package]]
+name = "rich-argparse"
+version = "1.6.0"
+description = "Rich help formatters for argparse and optparse"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"},
+    {file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"},
+]
+
+[package.dependencies]
+rich = ">=11.0.0"
+
 [[package]]
 name = "rsa"
 version = "4.9"
@@ -2050,16 +2104,92 @@ files = [
 [package.dependencies]
 pyasn1 = ">=0.1.3"

+[[package]]
+name = "ruamel-yaml"
+version = "0.18.10"
+description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"},
+    {file = "ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"},
+]
+
+[package.dependencies]
+"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""}
+
+[package.extras]
+docs = ["mercurial (>5.7)", "ryd"]
+jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"]
+
+[[package]]
+name = "ruamel-yaml-clib"
+version = "0.2.12"
+description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "platform_python_implementation == \"CPython\""
+files = [
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
+    {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
+    {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
+    {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
+    {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
+    {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
+    {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
+]
+
 [[package]]
 name = "s3transfer"
-version = "0.11.1"
+version = "0.11.2"
 description = "An Amazon S3 Transfer Manager"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"},
-    {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"},
+    {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"},
+    {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"},
 ]

 [package.dependencies]
@@ -2070,14 +2200,14 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"]

 [[package]]
 name = "selenium"
-version = "4.28.0"
+version = "4.28.1"
 description = "Official Python bindings for Selenium WebDriver"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"},
-    {file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"},
+    {file = "selenium-4.28.1-py3-none-any.whl", hash = "sha256:4238847e45e24e4472cfcf3554427512c7aab9443396435b1623ef406fff1cc1"},
+    {file = "selenium-4.28.1.tar.gz", hash = "sha256:0072d08670d7ec32db901bd0107695a330cecac9f196e3afb3fa8163026e022a"},
 ]

 [package.dependencies]
@@ -2386,14 +2516,14 @@ test = ["pytest"]

 [[package]]
 name = "starlette"
-version = "0.45.2"
+version = "0.45.3"
 description = "The little ASGI library that shines."
 optional = false
 python-versions = ">=3.9"
 groups = ["docs"]
 files = [
-    {file = "starlette-0.45.2-py3-none-any.whl", hash = "sha256:4daec3356fb0cb1e723a5235e5beaf375d2259af27532958e2d79df549dad9da"},
-    {file = "starlette-0.45.2.tar.gz", hash = "sha256:bba1831d15ae5212b22feab2f218bab6ed3cd0fc2dc1d4442443bb1ee52260e0"},
+    {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"},
+    {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"},
 ]

 [package.dependencies]
@@ -2920,7 +3050,7 @@ version = "1.2.0"
 description = "A small Python utility to set file creation time on Windows"
 optional = false
 python-versions = ">=3.5"
-groups = ["main"]
+groups = ["main", "dev"]
 markers = "sys_platform == \"win32\""
 files = [
    {file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"},
@@ -2947,14 +3077,14 @@ h11 = ">=0.9.0,<1"

 [[package]]
 name = "yt-dlp"
-version = "2025.1.12"
+version = "2025.1.26"
 description = "A feature-rich command-line audio/video downloader"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"},
-    {file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"},
+    {file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"},
+    {file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"},
 ]

 [package.extras]
@@ -2970,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "4873baccbe879f3e277bbe4354823ee6a494b1d362939f991dfca46ee9c6a906"
+content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,6 @@ dependencies = [
    "pdqhash (>=0.0.0)",
    "pillow (>=0.0.0)",
    "python-slugify (>=0.0.0)",
-    "pyyaml (>=0.0.0)",
    "dateparser (>=0.0.0)",
    "python-twitter-v2 (>=0.0.0)",
    "instaloader (>=0.0.0)",
@@ -47,7 +46,7 @@ dependencies = [
    "cryptography (>=41.0.0,<42.0.0)",
    "boto3 (>=1.28.0,<2.0.0)",
    "dataclasses-json (>=0.0.0)",
-    "yt-dlp (==2025.1.12)",
+    "yt-dlp (>=2025.1.26,<2026.0.0)",
    "numpy (==2.1.3)",
    "vk-url-scraper (>=0.0.0)",
    "requests[socks] (>=0.0.0)",
@@ -57,11 +56,14 @@ dependencies = [
    "retrying (>=0.0.0)",
    "tsp-client (>=0.0.0)",
    "certvalidator (>=0.0.0)",
+    "rich-argparse (>=1.6.0,<2.0.0)",
+    "ruamel-yaml (>=0.18.10,<0.19.0)",
 ]

 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.4"
 autopep8 = "^2.3.1"
+pytest-loguru = "^0.4.0"

 [tool.poetry.group.docs.dependencies]
 sphinx = "^8.1.3"
--- a/scripts/create_update_gdrive_oauth_token.py
+++ b/scripts/create_update_gdrive_oauth_token.py
@@ -12,7 +12,7 @@ from googleapiclient.errors import HttpError
 # Code below from https://developers.google.com/drive/api/quickstart/python
 # Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json

-SCOPES = ['https://www.googleapis.com/auth/drive']
+SCOPES = ["https://www.googleapis.com/auth/drive.file"]


@click.command(
@@ -23,7 +23,7 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
    "-c",
    type=click.Path(exists=True),
    help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials",
-    required=True
+    required=True,
 )
@click.option(
    "--token",
@@ -31,59 +31,62 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
    type=click.Path(exists=False),
    default="gd-token.json",
    help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json",
-    required=True
+    required=True,
 )
 def main(credentials, token):
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first time.
    creds = None
    if os.path.exists(token):
-        with open(token, 'r') as stream:
+        with open(token, "r") as stream:
            creds_json = json.load(stream)
            # creds = Credentials.from_authorized_user_file(creds_json, SCOPES)
-            creds_json['refresh_token'] = creds_json.get("refresh_token", "")
+            creds_json["refresh_token"] = creds_json.get("refresh_token", "")
            creds = Credentials.from_authorized_user_info(creds_json, SCOPES)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
-            print('Requesting new token')
+            print("Requesting new token")
            creds.refresh(Request())
        else:
-            print('First run through so putting up login dialog')
+            print("First run through so putting up login dialog")
            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
            flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES)
            creds = flow.run_local_server(port=55192)
        # Save the credentials for the next run
-        with open(token, 'w') as token:
-            print('Saving new token')
+        with open(token, "w") as token:
+            print("Saving new token")
            token.write(creds.to_json())
    else:
-        print('Token valid')
+        print("Token valid")

    try:
-        service = build('drive', 'v3', credentials=creds)
+        service = build("drive", "v3", credentials=creds)

        # About the user
        results = service.about().get(fields="*").execute()
-        emailAddress = results['user']['emailAddress']
+        emailAddress = results["user"]["emailAddress"]
        print(emailAddress)

        # Call the Drive v3 API and return some files
-        results = service.files().list(
-            pageSize=10, fields="nextPageToken, files(id, name)").execute()
-        items = results.get('files', [])
+        results = (
+            service.files()
+            .list(pageSize=10, fields="nextPageToken, files(id, name)")
+            .execute()
+        )
+        items = results.get("files", [])

        if not items:
-            print('No files found.')
+            print("No files found.")
            return
-        print('Files:')
+        print("Files:")
        for item in items:
-            print(u'{0} ({1})'.format(item['name'], item['id']))
+            print("{0} ({1})".format(item["name"], item["id"]))

    except HttpError as error:
-        print(f'An error occurred: {error}')
+        print(f"An error occurred: {error}")


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/scripts/telegram_setup.py
+++ b/scripts/telegram_setup.py
@@ -0,0 +1,29 @@
+"""
+This script is used to create a new session file for the Telegram client.
+To do this you must first create a Telegram application at https://my.telegram.org/apps
+And store your id and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH.
+Create a .env file, or add the following to your environment:
+```
+export TELEGRAM_API_ID=[YOUR_ID_HERE]
+export TELEGRAM_API_HASH=[YOUR_HASH_HERE]
+```
+Then run this script to create a new session file.
+
+You will need to provide your phone number and a 2FA code the first time you run this script.
+"""
+
+
+import os
+from telethon.sync import TelegramClient
+from loguru import logger
+
+
+# Telegram API credentials are read from the environment; the session file is stored under secrets/.
+API_ID = os.getenv("TELEGRAM_API_ID")
+API_HASH = os.getenv("TELEGRAM_API_HASH")
+SESSION_FILE = "secrets/anon-insta"
+
+os.makedirs("secrets", exist_ok=True)
+with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
+    logger.success(f"New session file created: {SESSION_FILE}.session")
+
--- a/src/auto_archiver/init.py
+++ b/src/auto_archiver/init.py
@@ -1,7 +0,0 @@
-from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core
-
-# need to manually specify due to cyclical deps
-from .core.orchestrator import ArchivingOrchestrator
-from .core.config import Config
-# making accessible directly
-from .core.metadata import Metadata
--- a/src/auto_archiver/main.py
+++ b/src/auto_archiver/main.py
@@ -1,13 +1,9 @@
 """ Entry point for the auto_archiver package. """
-from . import Config
-from . import ArchivingOrchestrator
+from auto_archiver.core.orchestrator import ArchivingOrchestrator
+import sys

 def main():
-    config = Config()
-    config.parse()
-    orchestrator = ArchivingOrchestrator(config)
-    for r in orchestrator.feed(): pass
-
+    ArchivingOrchestrator().run(sys.argv[1:])

 if __name__ == "__main__":
    main()
--- a/src/auto_archiver/archivers/init.py
+++ b/src/auto_archiver/archivers/init.py
@@ -1,16 +0,0 @@
-"""
-Archivers are responsible for retrieving the content from various external platforms.
-They act as specialized modules, each tailored to interact with a specific platform,
-service, or data source. The archivers collectively enable the tool to comprehensively
-collect and preserve a variety of content types, such as posts, images, videos and metadata.
-
-"""
-from .archiver import Archiver
-from .telethon_archiver import TelethonArchiver
-from .twitter_api_archiver import TwitterApiArchiver
-from .instagram_archiver import InstagramArchiver
-from .instagram_tbot_archiver import InstagramTbotArchiver
-from .telegram_archiver import TelegramArchiver
-from .vk_archiver import VkArchiver
-from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver
-from .instagram_api_archiver import InstagramAPIArchiver
--- a/src/auto_archiver/archivers/generic_archiver/init.py
+++ b/src/auto_archiver/archivers/generic_archiver/init.py
@@ -1 +0,0 @@
-from .generic_archiver import GenericArchiver
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -1,104 +0,0 @@
-"""
-InstagramTbotArchiver Module
-
-This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
-It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
-relevant media and metadata. The fetched content is saved as `Media` objects in a temporary directory and returned as a
-`Metadata` object.
-"""
-
-import shutil
-from telethon.sync import TelegramClient
-from loguru import logger
-import time, os
-from sqlite3 import OperationalError
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-from ..utils import random_str
-
-
-class InstagramTbotArchiver(Archiver):
-    """
-    calls a telegram bot to fetch instagram posts/stories... and gets available media from it
-    https://github.com/adw0rd/instagrapi
-    https://t.me/instagram_load_bot
-    """
-    name = "instagram_tbot_archiver"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.assert_valid_string("api_id")
-        self.assert_valid_string("api_hash")
-        self.timeout = int(self.timeout)
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
-            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
-            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
-        }
-
-    def setup(self) -> None:
-        """
-        1. makes a copy of session_file that is removed in cleanup
-        2. checks if the session file is valid
-        """
-        logger.info(f"SETUP {self.name} checking login...")
-
-        # make a copy of the session that is used exclusively with this archiver instance
-        new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
-        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file.replace(".session", "")
-
-        try:
-            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
-        except OperationalError as e:
-            logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
-
-        with self.client.start():
-            logger.success(f"SETUP {self.name} login works.")
-
-    def cleanup(self) -> None:
-        logger.info(f"CLEANUP {self.name}.")
-        session_file_name = self.session_file + ".session"
-        if os.path.exists(session_file_name):
-            os.remove(session_file_name)
-        
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-        if not "instagram.com" in url: return False
-
-        result = Metadata()
-        tmp_dir = ArchivingContext.get_tmp_dir()
-        with self.client.start():
-            chat = self.client.get_entity("instagram_load_bot")
-            since_id = self.client.send_message(entity=chat, message=url).id
-
-            attempts = 0
-            seen_media = []
-            message = ""
-            time.sleep(3)
-            # media is added before text by the bot so it can be used as a stop-logic mechanism
-            while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
-                attempts += 1
-                time.sleep(1)
-                for post in self.client.iter_messages(chat, min_id=since_id):
-                    since_id = max(since_id, post.id)
-                    if post.media and post.id not in seen_media:
-                        filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
-                        media = self.client.download_media(post.media, filename_dest)
-                        if media: 
-                            result.add_media(Media(media))
-                            seen_media.append(post.id)
-                    if post.message: message += post.message
-
-            if "You must enter a URL to a post" in message: 
-                logger.debug(f"invalid link {url=} for {self.name}: {message}")
-                return False
-                
-            if message:
-                result.set_content(message).set_title(message[:128])
-
-            return result.success("insta-via-bot")
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -1,2 +0,0 @@
-# temporary hack, as we implement module
-from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver
--- a/src/auto_archiver/core/init.py
+++ b/src/auto_archiver/core/init.py
@@ -3,9 +3,15 @@
 """
 from .metadata import Metadata
 from .media import Media
-from .step import Step
-from .context import ArchivingContext
+from .module import BaseModule

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
-# from .config import Config
+# from .config import Config
+
+from .database import Database
+from .enricher import Enricher
+from .feeder import Feeder
+from .storage import Storage
+from .extractor import Extractor
+from .formatter import Formatter
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -0,0 +1,146 @@
+
+from urllib.parse import urlparse
+from typing import  Mapping, Any
+from abc import ABC
+from copy import deepcopy, copy
+from tempfile import TemporaryDirectory
+from auto_archiver.utils import url as UrlUtil
+
+from loguru import logger
+
+class BaseModule(ABC):
+
+    """
+    Base module class. All modules should inherit from this class.
+
+    The exact methods a class implements will depend on the type of module it is,
+    however modules can have a .setup() method to run any setup code
+    (e.g. logging in to a site, spinning up a browser etc.)
+
+    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
+    a subclass can be of multiple types. For example, a module that extracts data from
+    a website and stores it in a database would be both an 'extractor' and a 'database' module.
+
+    Each module is a python package, and should have a __manifest__.py file in the
+    same directory as the module file. The __manifest__.py specifies the module information
+    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
+    default manifest structure.
+
+    """
+
+    MODULE_TYPES = [
+        'feeder',
+        'extractor',
+        'enricher',
+        'database',
+        'storage',
+        'formatter'
+    ]
+
+    _DEFAULT_MANIFEST = {
+    'name': '', # the display name of the module
+    'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
+    'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
+    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
+    'description': '', # a description of the module
+    'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
+    'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
+    'version': '1.0', # the version of the module
+    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
+}
+
+    config: Mapping[str, Any]
+    authentication: Mapping[str, Mapping[str, str]]
+    name: str
+
+    # this is set by the orchestrator prior to archiving
+    tmp_dir: TemporaryDirectory = None
+
+    @property
+    def storages(self) -> list:
+        return self.config.get('storages', [])
+
+    def config_setup(self, config: dict):
+
+        authentication = config.get('authentication', {})
+        # extract out concatenated sites
+        for key, val in copy(authentication).items():
+            if "," in key:
+                for site in key.split(","):
+                    authentication[site] = val
+                del authentication[key]
+
+        # this is important. Each instance is given its own deepcopied config, so modules cannot
+        # change values to affect other modules
+        config = deepcopy(config)
+        authentication = deepcopy(config.pop('authentication', {}))
+
+        self.authentication = authentication
+        self.config = config
+        for key, val in config.get(self.name, {}).items():
+            setattr(self, key, val)
+
+    def setup(self):
+        # For any additional setup required by modules, e.g. authentication
+        pass
+
+    def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
+        """
+        Returns the authentication information for a given site. This is used to authenticate
+        with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
+        
+        extract_cookies: bool - whether or not to extract cookies from the given browser and return the
+        cookie jar (disabling can speed up processing if you don't actually need the cookie jar)
+
+        Currently, the dict can have keys of the following types:
+        - username: str - the username to use for login
+        - password: str - the password to use for login
+        - api_key: str - the API key to use for login
+        - api_secret: str - the API secret to use for login
+        - cookie: str - a cookie string to use for login (specific to this site)
+        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
+        """
+        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
+        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
+
+        site = UrlUtil.domain_for_url(site)
+        # add the 'www' version of the site to the list of sites to check
+        authdict = {}
+
+
+        for to_try in [site, f"www.{site}"]:
+            if to_try in self.authentication:
+                authdict.update(self.authentication[to_try])
+                break
+
+        # do a fuzzy string match just to print a warning - don't use it since it's insecure
+        if not authdict:
+            for key in self.authentication.keys():
+                if key in site or site in key:
+                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
+                                    did find information for '{key}' which is close, is this what you meant? \
+                                    If so, edit your authentication settings to make sure it exactly matches.")
+
+        def get_ytdlp_cookiejar(args):
+            import yt_dlp
+            from yt_dlp import parse_options
+            logger.debug(f"Extracting cookies from settings: {args[1]}")
+            # parse_options returns a named tuple as follows, we only need the ydl_options part
+            # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
+            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
+            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
+
+        # get the cookie jar, preferring browser cookies over the cookies file
+        if 'cookies_from_browser' in self.authentication:
+            authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+        elif 'cookies_file' in self.authentication:
+            authdict['cookies_file'] = self.authentication['cookies_file']
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+        
+        return authdict
+    
+    def repr(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -5,125 +5,159 @@ flexible setup in various environments.

 """

-import importlib
 import argparse
-import yaml
-from dataclasses import dataclass, field
-from typing import List
-from collections import defaultdict
+from ruamel.yaml import YAML, CommentedMap, add_representer
+
 from loguru import logger

-from ..archivers import Archiver
-from ..feeders import Feeder
-from ..databases import Database
-from ..formatters import Formatter
-from ..storages import Storage
-from ..enrichers import Enricher
-from . import Step
-from ..utils import update_nested_dict
+from copy import deepcopy
+from .module import BaseModule

+from typing import Any, List, Type, Tuple

-@dataclass
-class Config:
-    configurable_parents = [
-        Feeder,
-        Enricher,
-        Archiver,
-        Database,
-        Storage,
-        Formatter
-        # Util
-    ]
-    feeder: Feeder
-    formatter: Formatter
-    archivers: List[Archiver] = field(default_factory=[])
-    enrichers: List[Enricher] = field(default_factory=[])
-    storages: List[Storage] = field(default_factory=[])
-    databases: List[Database] = field(default_factory=[])
+_yaml: YAML = YAML()

-    def __init__(self) -> None:
-        self.defaults = {}
-        self.cli_ops = {}
-        self.config = {}
+EMPTY_CONFIG = _yaml.load("""
+# Auto Archiver Configuration
+# Steps are the modules that will be run in the order they are defined

-    def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
+steps:""" + "".join([f"\n   {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
+"""
+
+# Global configuration
+
+# Authentication
+# a dictionary of authentication information that can be used by extractors to login to website. 
+# you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com)
+# Common login 'types' are username/password, cookie, api key/token.
+# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. 
+# Some Examples:
+# facebook.com:
+#   username: "my_username"
+#   password: "my_password"
+# or for a site that uses an API key:
+# twitter.com,x.com:
+#   api_key
+#   api_secret
+# youtube.com:
+#   cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
+
+authentication: {}
+
+# These are the global configurations that are used by the modules
+
+logging:
+  level: INFO
+""")
+# note: 'logging' is explicitly added above in order to better format the config file
+
+class DefaultValidatingParser(argparse.ArgumentParser):
+
+    def error(self, message):
        """
-        if yaml_config_filename is provided, the --config argument is ignored, 
-        useful for library usage when the config values are preloaded
-        overwrite_configs is a dict that overwrites the yaml file contents
+        Override of error to format a nicer looking error message using logger
        """
-        # 1. parse CLI values
-        if use_cli:
-            parser = argparse.ArgumentParser(
-                # prog = "auto-archiver",
-                description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
-                epilog="Check the code at https://github.com/bellingcat/auto-archiver"
-            )
+        logger.error("Problem with configuration file (tip: use --help to see the available options):")
+        logger.error(message)
+        self.exit(2)

-            parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
-            parser.add_argument('--version', action='version', version=importlib.metadata.version('auto_archiver'))
+    def parse_known_args(self, args=None, namespace=None):
+        """
+        Override of parse_known_args to also check the 'defaults' values - which are passed in from the config file
+        """
+        for action in self._actions:
+            if not namespace or action.dest not in namespace:
+                # for actions that are required and already have a default value, remove the 'required' check
+                if action.required and action.default is not None:
+                    action.required = False

-        # Iterate over all step subclasses to gather default configs and CLI arguments
-        for configurable in self.configurable_parents:
-            child: Step
-            for child in configurable.__subclasses__():
-                assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
-                for config, details in child.configs().items():
-                    assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
-                    assert "." not in config, f"config property cannot contain dots('.'): {config}"
-                    config_path = f"{child.name}.{config}"
+                if action.default is not None:
+                    try:
+                        self._check_value(action, action.default)
+                    except argparse.ArgumentError as e:
+                        logger.error(f"You have an invalid setting in your configuration file ({action.dest}):")
+                        logger.error(e)
+                        exit()

-                    if use_cli:
-                        try:
-                            parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
-                        except argparse.ArgumentError:
-                            # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
-                            pass
+        return super().parse_known_args(args, namespace)

-                    self.defaults[config_path] = details["default"]
-                    if "cli_set" in details:
-                        self.cli_ops[config_path] = details["cli_set"]

-        if use_cli:
-            args = parser.parse_args()
-            yaml_config_filename = yaml_config_filename or getattr(args, "config")
-        else: args = {}
+def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
+    dotdict = {}

-        # 2. read YAML config file (or use provided value)
-        self.yaml_config = self.read_yaml(yaml_config_filename)
-        update_nested_dict(self.yaml_config, overwrite_configs)
+    def process_subdict(subdict, prefix=""):
+        for key, value in subdict.items():
+            if is_dict_type(value):
+                process_subdict(value, f"{prefix}{key}.")
+            else:
+                dotdict[f"{prefix}{key}"] = value

-        # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
-        self.config = defaultdict(dict)
-        for config_path, default in self.defaults.items():
-            child, config = tuple(config_path.split("."))
-            val = getattr(args, config_path, None)
-            if val is not None and config_path in self.cli_ops:
-                val = self.cli_ops[config_path](val, default)
-            if val is None:
-                val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
-            self.config[child][config] = val
-        self.config = dict(self.config)
+    process_subdict(yaml_conf)
+    return dotdict

-        # 4. STEPS: read steps and validate they exist
-        steps = self.yaml_config.get("steps", {})
-        assert "archivers" in steps, "your configuration steps are missing the archivers property"
-        assert "storages" in steps, "your configuration steps are missing the storages property"
+def from_dot_notation(dotdict: dict) -> dict:
+    normal_dict = {}

-        self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
-        self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
-        self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
-        self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
-        self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
-        self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
+    def add_part(key, value, current_dict):
+        if "." in key:
+            key_parts = key.split(".")
+            current_dict.setdefault(key_parts[0], {})
+            add_part(".".join(key_parts[1:]), value, current_dict[key_parts[0]])
+        else:
+            current_dict[key] = value

-        logger.info(f"FEEDER: {self.feeder.name}")
-        logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
-        logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
-        logger.info(f"DATABASES: {[x.name for x in self.databases]}")
-        logger.info(f"STORAGES: {[x.name for x in self.storages]}")
-        logger.info(f"FORMATTER: {self.formatter.name}")
+    for key, value in dotdict.items():
+        add_part(key, value, normal_dict)

-    def read_yaml(self, yaml_filename: str) -> dict:
+    return normal_dict
+
+
+def is_list_type(value):
+    return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set)
+
+def is_dict_type(value):
+    return isinstance(value, dict) or isinstance(value, CommentedMap)
+
+def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
+    yaml_dict: CommentedMap = deepcopy(yaml_dict)
+
+    # first deal with lists, since 'update' replaces lists from a in b, but we want to extend
+    def update_dict(subdict, yaml_subdict):
+        for key, value in subdict.items():
+            if not yaml_subdict.get(key):
+                yaml_subdict[key] = value
+                continue
+
+            if is_dict_type(value):
+                update_dict(value, yaml_subdict[key])
+            elif is_list_type(value):
+                yaml_subdict[key].extend(s for s in value if s not in yaml_subdict[key])
+            else:
+                yaml_subdict[key] = value
+
+    update_dict(from_dot_notation(dotdict), yaml_dict)
+
+    return yaml_dict
+
+def read_yaml(yaml_filename: str) -> CommentedMap:
+    config = None
+    try:
        with open(yaml_filename, "r", encoding="utf-8") as inf:
-            return yaml.safe_load(inf)
+            config = _yaml.load(inf)
+    except FileNotFoundError:
+        pass
+
+    if not config:
+        config = EMPTY_CONFIG
+    
+    return config
+
+# TODO: make this tidier/find a way to notify of which keys should not be stored
+
+
+def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
+    config_to_save = deepcopy(config)
+
+    config_to_save.pop('urls', None)
+    with open(yaml_filename, "w", encoding="utf-8") as outf:
+        _yaml.dump(config_to_save, outf)
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -1,64 +0,0 @@
-""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
-
-This singleton class allows for:
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
- Marking certain values to persist across resets using `keep_on_reset`.
- Managing temporary directories and other shared data used during the archiving process.
-
-### Key Features:
- Creates a single global instance.
- Reset functionality allows for clearing configurations, with options for partial or full resets.
- Custom getters and setters for commonly used context values like temporary directories.
-
-"""
-
-class ArchivingContext:
-    """
-    Singleton context class for managing global configurations and temporary data.
-
-    ArchivingContext._get_instance() to retrieve it if needed
-    otherwise just 
-    ArchivingContext.set(key, value)
-    and 
-    ArchivingContext.get(key, default)
-
-    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
-        reset(full_reset=True) will recreate everything including the keep_on_reset status
-    """
-    _instance = None
-
-    def __init__(self):
-        self.configs = {}
-        self.keep_on_reset = set()
-
-    @staticmethod
-    def get_instance():
-        if ArchivingContext._instance is None:
-            ArchivingContext._instance = ArchivingContext()
-        return ArchivingContext._instance
-
-    @staticmethod
-    def set(key, value, keep_on_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        ac.configs[key] = value
-        if keep_on_reset: ac.keep_on_reset.add(key)
-
-    @staticmethod
-    def get(key: str, default=None):
-        return ArchivingContext.get_instance().configs.get(key, default)
-
-    @staticmethod
-    def reset(full_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        if full_reset: ac.keep_on_reset = set()
-        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-
-    # ---- custom getters/setters for widely used context values
-
-    @staticmethod
-    def set_tmp_dir(tmp_dir: str):
-        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
-
-    @staticmethod
-    def get_tmp_dir() -> str:
-        return ArchivingContext.get_instance().configs.get("tmp_dir")
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -1,22 +1,10 @@
 from __future__ import annotations
-from dataclasses import dataclass
-from abc import abstractmethod, ABC
+from abc import abstractmethod
 from typing import Union

-from ..core import Metadata, Step
+from auto_archiver.core import Metadata, BaseModule

-
-@dataclass
-class Database(Step, ABC):
-    name = "database"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Database:
-        # only for typing...
-        return Step.init(name, config, Database)
+class Database(BaseModule):

    def started(self, item: Metadata) -> None:
        """signals the DB that the given item archival has started"""
--- a/src/auto_archiver/core/enricher.py
+++ b/src/auto_archiver/core/enricher.py
@@ -0,0 +1,19 @@
+"""
+Enrichers are modular components that enhance archived content by adding
+context, metadata, or additional processing.
+
+These add additional information to the context, such as screenshots, hashes, and metadata.
+They are designed to work within the archiving pipeline, operating on `Metadata` objects after
+the archiving step and before storage or formatting.
+
+Enrichers are optional but highly useful for making the archived data more powerful.
+"""
+from __future__ import annotations
+from abc import abstractmethod
+from auto_archiver.core import Metadata, BaseModule
+
+class Enricher(BaseModule):
+    """Base class for enrichers in the Auto-Archiver system."""
+
+    @abstractmethod
+    def enrich(self, to_enrich: Metadata) -> None: pass
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -1,7 +1,7 @@
-""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
-    This class provides common utility methods and a standard interface for archivers.
+""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
+    This class provides common utility methods and a standard interface for extractors.

-    Factory method to initialize an archiver instance based on its name.
+    Factory method to initialize an extractor instance based on its name.


 """
@@ -11,48 +11,44 @@ from abc import abstractmethod
 from dataclasses import dataclass
 import mimetypes
 import os
-import mimetypes, requests
+import mimetypes
+import requests
 from loguru import logger
 from retrying import retry
+import re

-from ..core import Metadata, Step, ArchivingContext
+from auto_archiver.core import Metadata, BaseModule


-@dataclass
-class Archiver(Step):
+class Extractor(BaseModule):
    """
-    Base class for implementing archivers in the media archiving framework.
+    Base class for implementing extractors in the media archiving framework.
    Subclasses must implement the `download` method to define platform-specific behavior.
    """

-    name = "archiver"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Archiver:
-        # only for typing...
-        return Step.init(name, config, Archiver)
-
-    def setup(self) -> None:
-        # used when archivers need to login or do other one-time setup
-        pass
+    valid_url: re.Pattern = None

    def cleanup(self) -> None:
-        # called when archivers are done, or upon errors, cleanup any resources
+        # called when extractors are done, or upon errors, cleanup any resources
        pass

    def sanitize_url(self, url: str) -> str:
        # used to clean unnecessary URL parameters OR unfurl redirect links
        return url
    
+    def match_link(self, url: str) -> re.Match:
+        return self.valid_url.match(url)
+
    def suitable(self, url: str) -> bool:
        """
-        Returns True if this archiver can handle the given URL
-        
+        Returns True if this extractor can handle the given URL
+
        Should be overridden by subclasses
+
        """
+        if self.valid_url:
+            return self.match_link(url) is not None
+        
        return True

    def _guess_file_type(self, path: str) -> str:
@@ -74,7 +70,7 @@ class Archiver(Step):
            to_filename = url.split('/')[-1].split('?')[0]
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
-        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
+        to_filename = os.path.join(self.tmp_dir, to_filename)
        if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@@ -84,8 +80,8 @@ class Archiver(Step):
            d.raise_for_status()

            # get mimetype from the response headers
-            if not Path(to_filename).suffix:
-                content_type = d.headers.get('Content-Type')
+            if not mimetypes.guess_type(to_filename)[0]:
+                content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
                extension = mimetypes.guess_extension(content_type)
                if extension:
                    to_filename += extension
@@ -94,10 +90,16 @@ class Archiver(Step):
                for chunk in d.iter_content(chunk_size=8192):
                    f.write(chunk)
            return to_filename
-        
+
        except requests.RequestException as e:
            logger.warning(f"Failed to fetch the Media URL: {e}")

    @abstractmethod
-    def download(self, item: Metadata) -> Metadata:
-        pass
+    def download(self, item: Metadata) -> Metadata | bool:
+        """
+        Downloads the media from the given URL and returns a Metadata object with the downloaded media.
+        
+        If the URL is not supported or the download fails, this method should return False.
+
+        """
+        pass
--- a/src/auto_archiver/core/feeder.py
+++ b/src/auto_archiver/core/feeder.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+from abc import abstractmethod
+from auto_archiver.core import Metadata
+from auto_archiver.core import BaseModule
+
+class Feeder(BaseModule):
+
+    @abstractmethod
+    def __iter__(self) -> Metadata: return None
--- a/src/auto_archiver/core/formatter.py
+++ b/src/auto_archiver/core/formatter.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+from abc import abstractmethod
+from auto_archiver.core import Metadata, Media, BaseModule
+
+
+class Formatter(BaseModule):
+
+    @abstractmethod
+    def format(self, item: Metadata) -> Media: return None
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -11,11 +11,6 @@ from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

-import ffmpeg
-from ffmpeg._run import Error
-
-from .context import ArchivingContext
-
 from loguru import logger


@@ -39,12 +34,11 @@ class Media:
    _mimetype: str = None  # eg: image/jpeg
    _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+    def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
        # 'Any' typing for metadata to avoid circular imports. Stores the media
        # into the provided/available storages [Storage] repeats the process for
        # its properties, in case they have inner media themselves for now it
        # only goes down 1 level but it's easy to make it recursive if needed.
-        storages = override_storages or ArchivingContext.get("storages")
        if not len(storages):
            logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
            return
@@ -69,8 +63,9 @@ class Media:
                        for inner_media in prop_media.all_inner_media(include_self=True):
                            yield inner_media

-    def is_stored(self) -> bool:
-        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
+    def is_stored(self, in_storage) -> bool:
+        # checks if the media is already stored in the given storage
+        return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])

    def set(self, key: str, value: Any) -> Media:
        self.properties[key] = value
@@ -106,6 +101,12 @@ class Media:
        return self.mimetype.startswith("image")

    def is_valid_video(self) -> bool:
+        # Note: this is intentional, to only import ffmpeg here - when the method is called
+        # this speeds up loading the module. We check that 'ffmpeg' is available on startup
+        # when we load each manifest file
+        import ffmpeg
+        from ffmpeg._run import Error
+
        # checks for video streams with ffmpeg, or min file size for a video
        # self.is_video() should be used together with this method
        try:
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
 from loguru import logger

 from .media import Media
-from .context import ArchivingContext
-

@dataclass_json  # annotation order matters
@dataclass
@@ -32,6 +30,7 @@ class Metadata:

    def __post_init__(self):
        self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
+        self._context = {}

    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """
@@ -45,6 +44,7 @@ class Metadata:
        if overwrite_left:
            if right.status and len(right.status):
                self.status = right.status
+            self._context.update(right._context)
            for k, v in right.metadata.items():
                assert k not in self.metadata or type(v) == type(self.get(k))
                if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -57,12 +57,11 @@ class Metadata:
            return right.merge(self)
        return self

-    def store(self: Metadata, override_storages: List = None):
+    def store(self, storages=[]):
        # calls .store for all contained media. storages [Storage]
        self.remove_duplicate_media_by_hash()
-        storages = override_storages or ArchivingContext.get("storages")
        for media in self.media:
-            media.store(override_storages=storages, url=self.get_url(), metadata=self)
+            media.store(url=self.get_url(), metadata=self, storages=storages)

    def set(self, key: str, val: Any) -> Metadata:
        self.metadata[key] = val
@@ -206,3 +205,10 @@ class Metadata:
            if len(r.media) > len(most_complete.media): most_complete = r
            elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
        return most_complete
+
+    def set_context(self, key: str, val: Any) -> Metadata:
+        self._context[key] = val
+        return self
+    
+    def get_context(self, key: str, default: Any = None) -> Any:
+        return self._context.get(key, default)
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -0,0 +1,249 @@
+"""
+Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
+by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
+
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List
+import shutil
+import ast
+import copy
+import sys
+from importlib.util import find_spec
+import os
+from os.path import join
+from loguru import logger
+import auto_archiver
+from .base_module import BaseModule
+
+_LAZY_LOADED_MODULES = {}
+
+MANIFEST_FILE = "__manifest__.py"
+
+
+def setup_paths(paths: list[str]) -> None:
+    """
+    Adds each existing path in 'paths' to 'auto_archiver.modules.__path__', the list of module folders
+
+    Paths that do not exist are skipped with a warning; paths that are already registered are not added twice.
+    Afterwards the whole list is re-sorted by the length of the path string, longest first.
+    """
+    for path in paths:
+        # check path exists, if it doesn't, log a warning
+        if not os.path.exists(path):
+            logger.warning(f"Path '{path}' does not exist. Skipping...")
+            continue
+
+        # see odoo/module/module.py -> initialize_sys_path
+        if path not in auto_archiver.modules.__path__:
+                auto_archiver.modules.__path__.append(path)
+
+    # sort by the length of the path in descending order (reverse=True), so the longest path is first in the list
+    auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
+
+def get_module(module_name: str, config: dict) -> BaseModule:
+    """
+    Gets and sets up a module using the provided config
+    
+    This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
+    
+    """
+    return get_module_lazy(module_name).load(config)
+
+def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
+    """
+    Lazily loads a module, returning a LazyBaseModule
+    
+    This has all the information about the module, but does not load the module itself or its dependencies
+    
+    To load an actual module, call .load(config) on a lazy module
+    Raises IndexError if no module with this name is found; cached lazy modules are returned directly
+    """
+    if module_name in _LAZY_LOADED_MODULES:
+        return _LAZY_LOADED_MODULES[module_name]
+
+    available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
+    if not available:
+        raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
+    return available[0]
+
+def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
+    """Searches every folder in 'auto_archiver.modules.__path__' for sub-folders that contain a manifest file and
+    returns a newly created (and cached) LazyBaseModule for each, optionally limited to 'limit_to_modules'.
+    Modules already in _LAZY_LOADED_MODULES are skipped, i.e. not returned. 'with_manifest' is not used here."""
+    # see odoo/modules/module.py -> get_modules
+    def is_really_module(module_path):
+        if os.path.isfile(join(module_path, MANIFEST_FILE)):
+            return True
+
+    all_modules = []
+
+    for module_folder in auto_archiver.modules.__path__:
+        # walk through each module in module_folder and check if it has a valid manifest
+        try:
+            possible_modules = os.listdir(module_folder)
+        except FileNotFoundError:
+            logger.warning(f"Module folder {module_folder} does not exist")
+            continue
+
+        for possible_module in possible_modules:
+            if limit_to_modules and possible_module not in limit_to_modules:
+                continue
+
+            possible_module_path = join(module_folder, possible_module)
+            if not is_really_module(possible_module_path):
+                continue
+            if _LAZY_LOADED_MODULES.get(possible_module):  # cached by an earlier call, or found in an earlier folder
+                continue
+            lazy_module = LazyBaseModule(possible_module, possible_module_path)
+
+            _LAZY_LOADED_MODULES[possible_module] = lazy_module
+
+            all_modules.append(lazy_module)
+    
+    if not suppress_warnings:
+        for module in limit_to_modules:
+            if not any(module == m.name for m in all_modules):  # NOTE(review): also true for modules skipped above as already cached
+                logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
+
+    return all_modules
+
+@dataclass
+class LazyBaseModule:
+
+    """
+    A lazy module class, which only loads the manifest and does not load the module itself.
+
+    This is useful for getting information about a module without actually loading it.
+
+    """
+    name: str
+    type: list  # only set once the manifest has been read (see the 'manifest' property)
+    description: str  # only set once the manifest has been read (see the 'manifest' property)
+    path: str
+
+    _manifest: dict = None
+    _instance: BaseModule = None
+    _entry_point: str = None
+
+    def __init__(self, module_name, path):
+        self.name = module_name
+        self.path = path
+
+    @property
+    def entry_point(self):
+        if not self._entry_point and not self.manifest['entry_point']:
+            # the manifest names no entry point: derive '<name>::<NameInCamelCase>' from the module name
+            self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
+        return self._entry_point
+
+    @property
+    def dependencies(self) -> dict:
+        return self.manifest['dependencies']
+    
+    @property
+    def configs(self) -> dict:
+        return self.manifest['configs']
+    
+    @property
+    def requires_setup(self) -> bool:
+        return self.manifest['requires_setup']
+    
+    @property
+    def display_name(self) -> str:
+        return self.manifest['name']
+
+    @property
+    def manifest(self) -> dict:
+        if self._manifest:
+            return self._manifest
+        # start from a copy of the default manifest, the entries of the manifest file then override it
+        # load the manifest file
+        manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
+
+        with open(join(self.path, MANIFEST_FILE)) as f:
+            try:
+                manifest.update(ast.literal_eval(f.read()))
+            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
+                logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
+            
+        self._manifest = manifest
+        self.type = manifest['type']
+        self._entry_point = manifest['entry_point']
+        self.description = manifest['description']
+        self.version = manifest['version']
+
+        return manifest
+
+    def load(self, config) -> BaseModule:
+        """Checks the dependencies, imports and instantiates the module, applies 'config' and calls setup(); the instance is cached."""
+        if self._instance:
+            return self._instance
+
+        # check external dependencies are installed
+        def check_deps(deps, check):
+            for dep in deps:
+                if not len(dep):
+                    # clear out any empty strings that a user may have erroneously added
+                    continue
+                if not check(dep):
+                    logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
+                    exit(1)
+
+        def check_python_dep(dep):
+            # first check if it's a module:
+            try:
+                m = get_module_lazy(dep, suppress_warnings=True)
+                try:
+                # we must now load this module and set it up with the config
+                    m.load(config)
+                    return True
+                except Exception:
+                    logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
+                    return False
+            except IndexError:
+                # not a module, continue
+                pass
+
+            return find_spec(dep)
+
+        check_deps(self.dependencies.get('python', []), check_python_dep)
+        check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
+
+
+        logger.debug(f"Loading module '{self.display_name}'...")
+
+        for qualname in [self.name, f'auto_archiver.modules.{self.name}']:
+            try:
+                # first import the whole module, to make sure it's working properly
+                __import__(qualname)
+                break
+            except ImportError:
+                pass
+
+        # then import the file for the entry point
+        file_name, class_name = self.entry_point.split('::')
+        sub_qualname = f'{qualname}.{file_name}'
+
+        __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
+        # finally, get the class instance
+        instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
+        if not getattr(instance, 'name', None):
+            instance.name = self.name
+
+        if not getattr(instance, 'display_name', None):
+            instance.display_name = self.display_name
+
+        self._instance = instance
+
+        # merge the default config with the user config ('in' check, so falsy defaults like False or 0 are kept)
+        default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
+        config[self.name] = default_config  | config.get(self.name, {})
+        instance.config_setup(config)
+        instance.setup()
+        return instance
+
+    def __repr__(self):
+        return f"Module<'{self.display_name}' ({self.name})>"
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -5,48 +5,358 @@
 """

 from __future__ import annotations
-from typing import Generator, Union, List
+from typing import Generator, Union, List, Type
 from urllib.parse import urlparse
 from ipaddress import ip_address
+import argparse
+import os
+import sys
+import json
+from tempfile import TemporaryDirectory
+import traceback

-from .context import ArchivingContext
+from rich_argparse import RichHelpFormatter

-from ..archivers import Archiver
-from ..feeders import Feeder
-from ..formatters import Formatter
-from ..storages import Storage
-from ..enrichers import Enricher
-from ..databases import Database
-from .metadata import Metadata

-import tempfile, traceback
+from .metadata import Metadata, Media
+from auto_archiver.version import __version__
+from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .module import available_modules, LazyBaseModule, get_module, setup_paths
+from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
+from .module import BaseModule
+
 from loguru import logger


+DEFAULT_CONFIG_FILE = "orchestration.yaml"
+
+class JsonParseAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        try:
+            setattr(namespace, self.dest, json.loads(values))
+        except json.JSONDecodeError as e:
+            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
+
+
+class AuthenticationJsonParseAction(JsonParseAction):
+    def __call__(self, parser, namespace, values, option_string=None):
+        super().__call__(parser, namespace, values, option_string)
+        auth_dict = getattr(namespace, self.dest)
+        if isinstance(auth_dict, str):  # a JSON string is treated as the path of a JSON/YAML file
+            try:
+                with open(auth_dict, 'r') as f:
+                    try:
+                        auth_dict = json.load(f)
+                    except json.JSONDecodeError:
+                        # maybe it's yaml, try that (rewind first: json.load already consumed the file)
+                        f.seek(0)
+                        auth_dict = _yaml.load(f)
+            except Exception:  # unreadable/unparsable file: reported by the dictionary check below
+                pass
+
+        if not isinstance(auth_dict, dict):
+            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        for site, auth in auth_dict.items():
+            if not isinstance(site, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        setattr(namespace, self.dest, auth_dict)
+class UniqueAppendAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        # argparse pre-fills the namespace with the default: copy it (never mutate the default), None counts as empty
+        current = list(getattr(namespace, self.dest, None) or [])
+        for value in values:
+            if value not in current: current.append(value)
+        setattr(namespace, self.dest, current)
+
 class ArchivingOrchestrator:
-    def __init__(self, config) -> None:
-        self.feeder: Feeder = config.feeder
-        self.formatter: Formatter = config.formatter
-        self.enrichers: List[Enricher] = config.enrichers
-        self.archivers: List[Archiver] = config.archivers
-        self.databases: List[Database] = config.databases
-        self.storages: List[Storage] = config.storages
-        ArchivingContext.set("storages", self.storages, keep_on_reset=True)

-        try: 
-            for a in self.all_archivers_for_setup(): a.setup()
-        except (KeyboardInterrupt, Exception) as e:
-            logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
-            self.cleanup()
+    feeders: List[Type[Feeder]]
+    extractors: List[Type[Extractor]]
+    enrichers: List[Type[Enricher]]
+    databases: List[Type[Database]]
+    storages: List[Type[Storage]]
+    formatters: List[Type[Formatter]]
+    
+    def setup_basic_parser(self):
+        """Builds the first-pass parser (help, version, config file, mode, store, module paths), stores it on self.basic_parser and returns it."""
+        parser = argparse.ArgumentParser(
+                prog="auto-archiver",
+                add_help=False,
+                description="""
+                Auto Archiver is a CLI tool to archive media/metadata from online URLs;
+                it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
+                """,
+                epilog="Check the code at https://github.com/bellingcat/auto-archiver",
+                formatter_class=RichHelpFormatter,
+        )
+        # override the default 'help' so we can inject all the configs and show those
+        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
+        parser.add_argument('--version', action='version', version=__version__)
+        parser.add_argument('--config', action='store', dest="config_file", help=f"the filename of the YAML configuration file (defaults to '{DEFAULT_CONFIG_FILE}')", default=DEFAULT_CONFIG_FILE)
+        parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
+        parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
+        parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
+        self.basic_parser = parser
+        return parser
+
+    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
+        parser = DefaultValidatingParser(
+            add_help=False,
+        )
+        self.add_additional_args(parser)
+
+        # check what mode we're in
+        # if we have a config file, use that to decide which modules to load
+        # if simple, we'll load just the modules that has requires_setup = False
+        # if full, we'll load all modules
+        # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
+        # but should we add them? Or should we just add them to the 'complete' parser?
+        if yaml_config != EMPTY_CONFIG:
+            # only load the modules enabled in config
+            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
+            enabled_modules = []
+            # first loads the modules from the config file, then from the command line
+            for config in [yaml_config['steps'], basic_config.__dict__]:
+                for module_type in BaseModule.MODULE_TYPES:
+                    enabled_modules.extend(config.get(f"{module_type}s", []))
+
+            # clear out duplicates, but keep the order
+            enabled_modules = list(dict.fromkeys(enabled_modules))
+            avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
+            self.add_module_args(avail_modules, parser)
+        elif basic_config.mode == 'simple':
+            simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
+            self.add_module_args(simple_modules, parser)
+
+            # for simple mode, we use the cli_feeder and any modules that don't require setup
+            yaml_config['steps']['feeders'] = ['cli_feeder']
+            
+            # add them to the config
+            for module in simple_modules:
+                for module_type in module.type:
+                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
+        else:
+            # load all modules, they're not using the 'simple' mode
+            self.add_module_args(available_modules(with_manifest=True), parser)
+
+        parser.set_defaults(**to_dot_notation(yaml_config))
+
+        # reload the parser with the new arguments, now that we have them
+        parsed, unknown = parser.parse_known_args(unused_args)
+
+        # merge the new config with the old one
+        self.config = merge_dicts(vars(parsed), yaml_config)
+        # clean out args from the base_parser that we don't want in the config
+        for key in vars(basic_config):
+            self.config.pop(key, None)
+
+        # setup the logging
+        self.setup_logging()
+
+        if unknown:
+            logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
+        
+        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
+            logger.info(f"Storing configuration file to {basic_config.config_file}")
+            store_yaml(self.config, basic_config.config_file)
+        
+        return self.config
+    
+    def add_additional_args(self, parser: argparse.ArgumentParser = None):
+        if not parser:
+            parser = self.parser
+
+
+        # allow passing URLs directly on the command line
+        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
+
+        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
+        parser.add_argument('--enrichers', dest='steps.enrichers',  nargs='+', help='the enrichers to use', action=UniqueAppendAction)
+        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
+        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
+        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
+        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
+
+        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
+                                                                            (token, username etc.) that extractors can use to log into \
+                                                                            a website. If passing this on the command line, use a JSON string. \
+                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.',\
+                                                                            default={},
+                                                                            action=AuthenticationJsonParseAction)
+        # logging arguments
+        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
+        parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
+        parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
+
+
+    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+        """Adds an argument group per module, with one '--<module>.<config>' argument for each config in its manifest (all available modules if none are given)."""
+        import builtins  # local import: the '__builtins__' global may be a dict or a module (CPython implementation detail)
+        if not modules:
+            modules = available_modules(with_manifest=True)
+
+        module: LazyBaseModule
+        for module in modules:
+            if not module.configs:
+                # this module has no configs, don't show anything in the help
+                # (TODO: do we want to show something about this module though, like a description?)
+                continue
+
+            group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
+
+            for name, kwargs in module.configs.items():  # NOTE: kwargs is the manifest's own dict, the edits below persist on it
+                if not kwargs.get('metavar', None):
+                    # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
+                    kwargs['metavar'] = name.upper()
+
+                if kwargs.get('required', False):
+                    # required args shouldn't have a 'default' value, remove it
+                    kwargs.pop('default', None)
+
+                kwargs.pop('cli_set', None)
+                should_store = kwargs.pop('should_store', False)
+                kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
+                try:
+                    kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
+                except AttributeError:
+                    kwargs['type'] = getattr(builtins, kwargs.get('type') or '__invalid__', str)  # not a validator: use the builtin of that name, else str
+                arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
+                arg.should_store = should_store
+
+    def show_help(self, basic_config: dict):
+        # for the help message, we want to load *all* possible modules and show the help
+            # add configs as arg parser arguments
+        
+        self.add_additional_args(self.basic_parser)
+        self.add_module_args(parser=self.basic_parser)
+        self.basic_parser.print_help()
+        self.basic_parser.exit()
+    
+    def setup_logging(self):
+        # setup loguru logging
+        logger.remove(0) # remove the default logger
+        logging_config = self.config['logging']
+        logger.add(sys.stderr, level=logging_config['level'])
+        if log_file := logging_config['file']:
+            logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
+
+    def install_modules(self, modules_by_type):
+        """
+        Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the 
+        orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
+        are loaded, the program will exit with an error message.
+        """
+        
+        invalid_modules = []
+        for module_type in BaseModule.MODULE_TYPES:
+
+            step_items = []
+            modules_to_load = modules_by_type[f"{module_type}s"]
+            assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
+
+            def check_steps_ok():
+                if not len(step_items):
+                    logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
+                    if len(modules_to_load):
+                        logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
+                    exit()
+
+                if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
+                    logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
+                    exit()
+
+            for module in modules_to_load:
+                if module == 'cli_feeder':
+                    # pseudo module, don't load it
+                    urls = self.config['urls']
+                    if not urls:
+                        logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
+                        exit()
+                    # cli_feeder is a pseudo module, it just takes the command line args
+                    def feed(self) -> Generator[Metadata]:
+                        for url in urls:
+                            logger.debug(f"Processing URL: '{url}'")
+                            yield Metadata().set_url(url)
+
+                    pseudo_module = type('CLIFeeder', (Feeder,), {
+                        'name': 'cli_feeder',
+                        'display_name': 'CLI Feeder',
+                        '__iter__': feed
+
+                    })()
+
+                    pseudo_module.__iter__ = feed
+                    step_items.append(pseudo_module)
+                    continue
+
+                if module in invalid_modules:
+                    continue
+                loaded_module: BaseModule = None  # stays None if get_module() raises, so the handler never reads an unbound/stale value
+                try:
+                    loaded_module = get_module(module, self.config)
+                except (KeyboardInterrupt, Exception) as e:
+                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
+                    if module_type == 'extractor' and loaded_module is not None and loaded_module.name == module:
+                        loaded_module.cleanup()
+                    exit()
+
+                if not loaded_module:
+                    invalid_modules.append(module)
+                    continue
+                if loaded_module:
+                    step_items.append(loaded_module)
+
+            check_steps_ok()
+            setattr(self, f"{module_type}s", step_items)
+    
+    def load_config(self, config_file: str) -> dict:
+        if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
+            logger.error(f"The configuration file {config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
+            exit()
+
+        return read_yaml(config_file)
+
+    def run(self, args: list) -> None:
+        
+        self.setup_basic_parser()
+
+        # parse the known arguments for now (basically, we want the config file)
+        basic_config, unused_args = self.basic_parser.parse_known_args(args)
+
+        # setup any custom module paths, so they'll show in the help and for arg parsing
+        setup_paths(basic_config.module_paths)
+
+        # if help flag was called, then show the help
+        if basic_config.help:
+            self.show_help(basic_config)
+
+        yaml_config = self.load_config(basic_config.config_file)
+        self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+        logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
+        self.install_modules(self.config['steps'])
+
+        # log out the modules that were loaded
+        for module_type in BaseModule.MODULE_TYPES:
+            logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
+
+        for _ in self.feed():
+            pass

    def cleanup(self)->None:
        logger.info("Cleaning up")
-        for a in self.all_archivers_for_setup(): a.cleanup()
+        for e in self.extractors:
+            e.cleanup()

    def feed(self) -> Generator[Metadata]:
-        for item in self.feeder:
-            yield self.feed_item(item)
+
+        url_count = 0
+        for feeder in self.feeders:
+            for item in feeder:
+                yield self.feed_item(item)
+                url_count += 1
+
+        logger.success(f"Processed {url_count} URL(s)")
        self.cleanup()

    def feed_item(self, item: Metadata) -> Metadata:
@@ -55,22 +365,33 @@ class ArchivingOrchestrator:
            - catches keyboard interruptions to do a clean exit
            - catches any unexpected error, logs it, and does a clean exit
        """
+        tmp_dir: TemporaryDirectory = None
        try:
-            ArchivingContext.reset()
-            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                ArchivingContext.set_tmp_dir(tmp_dir)
-                return self.archive(item)
+            tmp_dir = TemporaryDirectory(dir="./")
+            # set tmp_dir on all modules
+            for m in self.all_modules:
+                m.tmp_dir = tmp_dir.name
+            return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
            logger.warning(f"caught interrupt on {item=}")
-            for d in self.databases: d.aborted(item)
+            for d in self.databases:
+                d.aborted(item)
            self.cleanup()
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.databases:
-                if type(e) == AssertionError: d.failed(item, str(e))
-                else: d.failed(item)
+                if type(e) == AssertionError:
+                    d.failed(item, str(e))
+                else:
+                    d.failed(item, reason="unexpected error")
+        finally:
+            if tmp_dir:
+                # remove the tmp_dir from all modules
+                for m in self.all_modules:
+                    m.tmp_dir = None
+                tmp_dir.cleanup()


    def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -83,12 +404,19 @@ class ArchivingOrchestrator:
            5. Store all downloaded/generated media
            6. Call selected Formatter and store formatted if needed
        """
+
        original_url = result.get_url().strip()
-        self.assert_valid_url(original_url)
+        try:
+            self.assert_valid_url(original_url)
+        except AssertionError as e:
+            logger.error(f"Error archiving URL {original_url}: {e}")
+            raise e

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
-        for a in self.archivers: url = a.sanitize_url(url)
+        for a in self.extractors:
+            url = a.sanitize_url(url)
+
        result.set_url(url)
        if original_url != url: result.set("original_url", original_url)

@@ -96,8 +424,8 @@ class ArchivingOrchestrator:
        cached_result = None
        for d in self.databases:
            d.started(result)
-            if (local_result := d.fetch(result)):
-                cached_result = (cached_result or Metadata()).merge(local_result)
+            if local_result := d.fetch(result):
+                cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
        if cached_result:
            logger.debug("Found previously archived entry")
            for d in self.databases:
@@ -106,9 +434,9 @@ class ArchivingOrchestrator:
                    logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
            return cached_result

-        # 3 - call archivers until one succeeds
-        for a in self.archivers:
-            logger.info(f"Trying archiver {a.name} for {url}")
+        # 3 - call extractors until one succeeds
+        for a in self.extractors:
+            logger.info(f"Trying extractor {a.name} for {url}")
            try:
                result.merge(a.download(result))
                if result.is_success(): break
@@ -122,11 +450,12 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

        # 5 - store all downloaded/generated media
-        result.store()
+        result.store(storages=self.storages)

        # 6 - format and store formatted if needed
-        if (final_media := self.formatter.format(result)):
-            final_media.store(url=url, metadata=result)
+        final_media: Media
+        if final_media := self.formatters[0].format(result):
+            final_media.store(url=url, metadata=result, storages=self.storages)
            result.set_final_media(final_media)

        if result.is_empty():
@@ -160,5 +489,9 @@ class ArchivingOrchestrator:
            assert not ip.is_link_local, f"Invalid IP used"
            assert not ip.is_private, f"Invalid IP used"

-    def all_archivers_for_setup(self) -> List[Archiver]:
-        return self.archivers + [e for e in self.enrichers if isinstance(e, Archiver)]
+
+    # Helper Properties
+    
+    @property
+    def all_modules(self) -> List[BaseModule]:
+        """
+        All loaded module instances of every type, in the order feeders,
+        extractors, enrichers, databases, storages, formatters.
+
+        A new list is built on every access, so mutating the returned list
+        does not affect the per-type lists.
+        """
+        # the elements are module *instances* (their methods are called and
+        # `tmp_dir` is set on them), hence List[BaseModule] rather than List[Type[...]]
+        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
--- a/src/auto_archiver/core/step.py
+++ b/src/auto_archiver/core/step.py
@@ -1,48 +0,0 @@
-"""
-Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
-by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
-
-"""
-
-from __future__ import annotations
-from dataclasses import dataclass
-from inspect import ClassFoundException
-from typing import Type
-from abc import ABC
-
-
-@dataclass
-class Step(ABC):
-    name: str = None
-
-    def __init__(self, config: dict) -> None:
-        # Initialises each step by reading the relevant entries
-        # reads the configs into object properties
-        # self.config = config[self.name]
-        for k, v in config.get(self.name, {}).items():
-            self.__setattr__(k, v)
-
-    @staticmethod
-    def configs() -> dict: return {}
-
-    def init(name: str, config: dict, child: Type[Step]) -> Step:
-        """
-        Attempts to instantiate a subclass of the provided `child` type
-        matching the given `name`.
-        Raises ClassFoundException if no matching subclass is found.
-        TODO: cannot find subclasses of child.subclasses
-        """
-        for sub in child.__subclasses__():
-            if sub.name == name:
-                return sub(config)
-        raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")
-
-    def assert_valid_string(self, prop: str) -> None:
-        """
-        Receives a property name and ensures it exists and is a valid non-empty string,
-        raising an AssertionError if not.
-        TODO: replace assertions with custom exceptions
-        """
-        assert hasattr(self, prop), f"property {prop} not found"
-        s = getattr(self, prop)
-        assert s is not None and type(s) == str and len(s) > 0, f"invalid property {prop} value '{s}', it should be a valid string"
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+from abc import abstractmethod
+from typing import IO
+import os
+
+from loguru import logger
+from slugify import slugify
+
+from auto_archiver.utils.misc import random_str
+
+from auto_archiver.core import Media, BaseModule, Metadata
+from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
+from auto_archiver.core.module import get_module
+class Storage(BaseModule):
+    """
+    Base class for storage modules: gives each Media a storage key, uploads
+    its file and records the resulting URL on the media.
+
+    Subclasses must implement `get_cdn_url` and `uploadf`.
+    """
+
+    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
+        """
+        Stores `media` in this storage unless it is already stored there.
+
+        :param media: the media whose file should be uploaded
+        :param url: the archived URL, used by the "url" path generator
+        :param metadata: optional item metadata, its 'folder' context value prefixes the key
+        """
+        if media.is_stored(in_storage=self):
+            logger.debug(f"{media.key} already stored, skipping")
+            return
+        self.set_key(media, url, metadata)
+        # NOTE(review): the result of upload() is not checked, the URL is added
+        # regardless - confirm this best-effort behaviour is intended
+        self.upload(media, metadata=metadata)
+        media.add_url(self.get_cdn_url(media))
+
+    @abstractmethod
+    def get_cdn_url(self, media: Media) -> str:
+        """Returns the URL that `store` adds to `media` after the upload."""
+        pass
+
+    @abstractmethod
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
+        """
+        Uploads the already opened binary `file`.
+
+        NOTE(review): `upload` passes the Media object (not a key string) as the
+        second positional argument, so implementations receive a Media in `key`.
+        The parameter name and annotation are kept unchanged for compatibility.
+        """
+        pass
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        """Opens the media file in binary mode, hands it to `uploadf` and returns its result."""
+        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
+        with open(media.filename, 'rb') as f:
+            return self.uploadf(f, media, **kwargs)
+
+    def set_key(self, media: Media, url, metadata: Metadata) -> None:
+        """
+        Generates the storage key `<folder>/<path>/<filename><ext>` for `media`.
+        Does nothing if the media already has a non-empty key.
+
+        The path comes from the `path_generator` config value:
+        "flat" (empty path), "url" (slugified url, the default) or "random"
+        (the `random_path` config value, or a fresh `random_str(24)`).
+
+        The filename comes from the `filename_generator` config value:
+        "random" (`random_str(24)`, the default) or "static" (first 24
+        characters of the file hash calculated by the hash enricher).
+
+        :raises ValueError: if either generator setting has an unknown value
+        """
+        if media.key is not None and len(media.key) > 0: return
+        # `store` allows metadata=None: use no folder instead of failing on None
+        folder = metadata.get_context('folder', '') if metadata is not None else ''
+        filename, ext = os.path.splitext(media.filename)
+
+        # Handle path_generator logic
+        path_generator = self.config.get("path_generator", "url")
+        if path_generator == "flat":
+            path = ""
+            # NOTE(review): this value is always replaced by the filename
+            # generator below, so the slugified name never reaches the key
+            filename = slugify(filename)  # Ensure filename is slugified
+        elif path_generator == "url":
+            path = slugify(url)
+        elif path_generator == "random":
+            # a mapping's `get` only takes (key, default), a third positional
+            # argument raises a TypeError
+            path = self.config.get("random_path", random_str(24))
+        else:
+            raise ValueError(f"Invalid path_generator: {path_generator}")
+
+        # Handle filename_generator logic
+        filename_generator = self.config.get("filename_generator", "random")
+        if filename_generator == "random":
+            filename = random_str(24)
+        elif filename_generator == "static":
+            # load the hash_enricher module
+            he = get_module(HashEnricher, self.config)
+            hd = he.calculate_hash(media.filename)
+            filename = hd[:24]
+        else:
+            raise ValueError(f"Invalid filename_generator: {filename_generator}")
+
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
--- a/src/auto_archiver/core/validators.py
+++ b/src/auto_archiver/core/validators.py
@@ -0,0 +1,19 @@
+# used as validators for config values. Should raise an exception if the value is invalid.
+from pathlib import Path
+import argparse
+
+def example_validator(value):
+    """Accepts `value` only if it contains "example" and returns it unchanged.
+
+    Raises argparse.ArgumentTypeError otherwise.
+    """
+    if "example" not in value:
+        raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
+    return value
+
+def positive_number(value):
+    """Rejects negative numbers and returns any other `value` unchanged.
+
+    Raises argparse.ArgumentTypeError when `value < 0`.
+    """
+    # NOTE(review): zero passes this check although the name and the message
+    # say "positive" - confirm whether 0 is meant to be accepted
+    if value < 0:
+        raise argparse.ArgumentTypeError(f"{value} is not a positive number")
+    return value
+
+
+def valid_file(value):
+    """Accepts `value` only if `Path(value).is_file()` is true and returns it unchanged.
+
+    Raises argparse.ArgumentTypeError otherwise.
+    """
+    if not Path(value).is_file():
+        raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
+    return value
--- a/src/auto_archiver/databases/init.py
+++ b/src/auto_archiver/databases/init.py
@@ -1,10 +0,0 @@
-""" Databases are used to store the outputs from running the Autp Archiver.
-
-
-"""
-from .database import Database
-from .gsheet_db import GsheetsDb
-from .console_db import ConsoleDb
-from .csv_db import CSVDb
-from .api_db import AAApiDb
-from .atlos_db import AtlosDb
--- a/src/auto_archiver/databases/api_db.py
+++ b/src/auto_archiver/databases/api_db.py
@@ -1,70 +0,0 @@
-from typing import Union
-import requests, os
-from loguru import logger
-
-from . import Database
-from ..core import Metadata
-
-
-class AAApiDb(Database):
-    """
-        Connects to auto-archiver-api instance
-    """
-    name = "auto_archiver_api_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.allow_rearchive = bool(self.allow_rearchive)
-        self.store_results = bool(self.store_results)
-        self.assert_valid_string("api_endpoint")
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
-            "api_token": {"default": None, "help": "API Bearer token."},
-            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
-            "author_id": {"default": None, "help": "which email to assign as author"},
-            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
-            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
-            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
-            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
-        }
-    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
-        """ query the database for the existence of this item.
-            Helps avoid re-archiving the same URL multiple times.
-        """
-        if not self.allow_rearchive: return
-        
-        params = {"url": item.get_url(), "limit": 15}
-        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
-        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
-
-        if response.status_code == 200:
-            if len(response.json()):
-                logger.success(f"API returned {len(response.json())} previously archived instance(s)")
-                fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
-                return Metadata.choose_most_complete(fetched_metadata)
-        else:
-            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
-        return False
-
-
-    def done(self, item: Metadata, cached: bool=False) -> None:
-        """archival result ready - should be saved to DB"""
-        if not self.store_results: return
-        if cached: 
-            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
-            return
-        logger.debug(f"saving archive of {item.get_url()} to the AA API.")
-
-        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
-        headers = {"Authorization": f"Bearer {self.api_token}"}
-        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
-
-        if response.status_code == 200:
-            logger.success(f"AA API: {response.json()}")
-        else:
-            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
-
--- a/src/auto_archiver/enrichers/init.py
+++ b/src/auto_archiver/enrichers/init.py
@@ -1,24 +0,0 @@
-"""
-Enrichers are modular components that enhance archived content by adding
-context, metadata, or additional processing.
-
-These add additional information to the context, such as screenshots, hashes, and metadata.
-They are designed to work within the archiving pipeline, operating on `Metadata` objects after
-the archiving step and before storage or formatting.
-
-Enrichers are optional but highly useful for making the archived data more powerful.
-
-
-"""
-from .enricher import Enricher
-from .screenshot_enricher import ScreenshotEnricher 
-from .wayback_enricher import WaybackArchiverEnricher
-from .hash_enricher import HashEnricher
-from .thumbnail_enricher import ThumbnailEnricher
-from .wacz_enricher import WaczArchiverEnricher
-from .whisper_enricher import WhisperEnricher
-from .pdq_hash_enricher import PdqHashEnricher
-from .metadata_enricher import MetadataEnricher
-from .meta_enricher import MetaEnricher
-from .ssl_enricher import SSLEnricher
-from .timestamping_enricher import TimestampingEnricher
--- a/src/auto_archiver/enrichers/enricher.py
+++ b/src/auto_archiver/enrichers/enricher.py
@@ -1,22 +0,0 @@
-""" Base classes and utilities for enrichers in the Auto-Archiver system.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-from abc import abstractmethod, ABC
-from ..core import Metadata, Step
-
-@dataclass
-class Enricher(Step, ABC):
-    name = "enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        
-
-    # only for typing...
-    def init(name: str, config: dict) -> Enricher:
-        return Step.init(name, config, Enricher)
-
-    @abstractmethod
-    def enrich(self, to_enrich: Metadata) -> None: pass
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -1,75 +0,0 @@
-""" Hash Enricher for generating cryptographic hashes of media files.
-
-The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
-for media files stored in `Metadata` objects. These hashes are used for
-validating content integrity, ensuring data authenticity, and identifying
-exact duplicates. The hash is computed by reading the file's bytes in chunks,
-making it suitable for handling large files efficiently.
-
-"""
-import hashlib
-from loguru import logger
-
-from . import Enricher
-from ..core import Metadata, ArchivingContext
-
-
-class HashEnricher(Enricher):
-    """
-    Calculates hashes for Media instances
-    """
-    name = "hash_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        algos = self.configs()["algorithm"]
-        algo_choices = algos["choices"]
-        if not getattr(self, 'algorithm', None):
-            if not config.get('algorithm'):
-                logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
-                self.algorithm = algos["default"]
-            else:
-                self.algorithm = config["algorithm"]
-
-        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
-
-        if not getattr(self, 'chunksize', None):
-            if config.get('chunksize'):
-                self.chunksize = config["chunksize"]
-            else:
-                self.chunksize = self.configs()["chunksize"]["default"]
-
-        self.chunksize = int(self.chunksize)
-        assert self.chunksize >= -1, "read length must be non-negative or -1"
-
-        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
-            "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
-        }
-
-    def enrich(self, to_enrich: Metadata) -> None:
-        url = to_enrich.get_url()
-        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
-
-        for i, m in enumerate(to_enrich.media):
-            if len(hd := self.calculate_hash(m.filename)):
-                to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
-
-    def calculate_hash(self, filename) -> str:
-        hash = None
-        if self.algorithm == "SHA-256":
-            hash = hashlib.sha256()
-        elif self.algorithm == "SHA3-512":
-            hash = hashlib.sha3_512()
-        else: return ""
-        with open(filename, "rb") as f:
-            while True:
-                buf = f.read(self.chunksize)
-                if not buf: break
-                hash.update(buf)
-        return hash.hexdigest()
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -1,51 +0,0 @@
-from loguru import logger
-import time, os
-import base64
-
-from selenium.common.exceptions import TimeoutException
-
-
-from . import Enricher
-from ..utils import Webdriver, UrlUtil, random_str  
-from ..core import Media, Metadata, ArchivingContext
-
-class ScreenshotEnricher(Enricher):
-    name = "screenshot_enricher"
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "width": {"default": 1280, "help": "width of the screenshots"},
-            "height": {"default": 720, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
-            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-            "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
-        }
-
-    def enrich(self, to_enrich: Metadata) -> None:
-        url = to_enrich.get_url()
-
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
-        logger.debug(f"Enriching screenshot for {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
-            try:
-                driver.get(url)
-                time.sleep(int(self.sleep_before_screenshot))
-                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
-                driver.save_screenshot(screenshot_file)
-                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
-                if self.save_to_pdf:
-                    pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
-                    pdf = driver.print_page(driver.print_options)
-                    with open(pdf_file, "wb") as f:
-                        f.write(base64.b64decode(pdf))
-                    to_enrich.add_media(Media(filename=pdf_file), id="pdf")
-            except TimeoutException:
-                logger.info("TimeoutException loading page for screenshot")
-            except Exception as e:
-                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
--- a/src/auto_archiver/feeders/init.py
+++ b/src/auto_archiver/feeders/init.py
@@ -1,7 +0,0 @@
-""" Feeders handle the input of media into the Auto Archiver.
-
-"""
-from.feeder import Feeder
-from .gsheet_feeder import GsheetsFeeder
-from .cli_feeder import CLIFeeder
-from .atlos_feeder import AtlosFeeder
--- a/src/auto_archiver/feeders/cli_feeder.py
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -1,32 +0,0 @@
-from loguru import logger
-
-from . import Feeder
-from ..core import Metadata, ArchivingContext
-
-
-class CLIFeeder(Feeder):
-    name = "cli_feeder"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        if type(self.urls) != list or len(self.urls) == 0:
-            raise Exception("CLI Feeder did not receive any URL to process")
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "urls": {
-                "default": None,
-                "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
-            },
-        }
-
-    def __iter__(self) -> Metadata:
-        for url in self.urls:
-            logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
--- a/src/auto_archiver/feeders/csv_feeder.py
+++ b/src/auto_archiver/feeders/csv_feeder.py
@@ -1,41 +0,0 @@
-from loguru import logger
-import csv
-
-from . import Feeder
-from ..core import Metadata, ArchivingContext
-from ..utils import url_or_none
-
-class CSVFeeder(Feeder):
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "files": {
-                "default": None,
-                "help": "Path to the input file(s) to read the URLs from, comma separated. \
-                        Input files should be formatted with one URL per line",
-                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
-            },
-            "column": {
-                "default": None,
-                "help": "Column number or name to read the URLs from, 0-indexed",
-            }
-        }
-    
-
-    def __iter__(self) -> Metadata:
-        url_column = self.column or 0
-        for file in self.files:
-            with open(file, "r") as f:
-                reader = csv.reader(f)
-                first_row = next(reader)
-                if not(url_or_none(first_row[url_column])):
-                    # it's a header row, skip it
-                    logger.debug(f"Skipping header row: {first_row}")
-                for row in reader:
-                    url = row[0]
-                    logger.debug(f"Processing {url}")
-                    yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
--- a/src/auto_archiver/feeders/feeder.py
+++ b/src/auto_archiver/feeders/feeder.py
@@ -1,21 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from abc import abstractmethod
-from ..core import Metadata
-from ..core import Step
-
-
-@dataclass
-class Feeder(Step):
-    name = "feeder"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Feeder:
-        # only for code typing
-        return Step.init(name, config, Feeder)
-
-    @abstractmethod
-    def __iter__(self) -> Metadata: return None
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -1,105 +0,0 @@
-"""
-GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
-
-This reads data from Google Sheets and filters rows based on user-defined rules.
-The filtered rows are processed into `Metadata` objects.
-
-### Key properties
- validates the sheet's structure and filters rows based on input configurations.
- Ensures only rows with valid URLs and unprocessed statuses are included.
-"""
-import gspread, os
-
-from loguru import logger
-from slugify import slugify
-
-# from . import Enricher
-from . import Feeder
-from ..core import Metadata, ArchivingContext
-from ..utils import Gsheets, GWorksheet
-
-
-class GsheetsFeeder(Gsheets, Feeder):
-    name = "gsheet_feeder"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.gsheets_client = gspread.service_account(filename=self.service_account)
-
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Gsheets.configs(),
-            ** {
-                "allow_worksheets": {
-                    "default": set(),
-                    "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-                },
-                "block_worksheets": {
-                    "default": set(),
-                    "help": "(CSV) explicitly block some worksheets from being processed",
-                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-                },
-                "use_sheet_names_in_stored_paths": {
-                    "default": True,
-                    "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
-                }
-            })
-
-    def __iter__(self) -> Metadata:
-        sh = self.open_sheet()
-        for ii, wks in enumerate(sh.worksheets()):
-            if not self.should_process_sheet(wks.title):
-                logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
-                continue
-
-            logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
-            gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
-
-            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
-                continue
-
-            for row in range(1 + self.header, gw.count_rows() + 1):
-                url = gw.get_cell(row, 'url').strip()
-                if not len(url): continue
-
-                original_status = gw.get_cell(row, 'status')
-                status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
-                # TODO: custom status parser(?) aka should_retry_from_status
-                if status not in ['', None]: continue
-
-                # All checks done - archival process starts here
-                m = Metadata().set_url(url)
-                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
-                if gw.get_cell_or_default(row, 'folder', "") is None:
-                    folder = ''
-                else:
-                    folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-                if len(folder):
-                    if self.use_sheet_names_in_stored_paths:
-                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
-                    else:
-                        ArchivingContext.set("folder", folder, True)
-
-                yield m
-
-            logger.success(f'Finished worksheet {wks.title}')
-
-    def should_process_sheet(self, sheet_name: str) -> bool:
-        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
-            # ALLOW rules exist AND sheet name not explicitly allowed
-            return False
-        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
-            # BLOCK rules exist AND sheet name is blocked
-            return False
-        return True
-
-    def missing_required_columns(self, gw: GWorksheet) -> list:
-        missing = []
-        for required_col in ['url', 'status']:
-            if not gw.col_exists(required_col):
-                missing.append(required_col)
-        return missing
--- a/src/auto_archiver/formatters/init.py
+++ b/src/auto_archiver/formatters/init.py
@@ -1,4 +0,0 @@
-""" Formatters for the output of the content. """
-from .formatter import Formatter
-from .html_formatter import HtmlFormatter
-from .mute_formatter import MuteFormatter
--- a/src/auto_archiver/formatters/formatter.py
+++ b/src/auto_archiver/formatters/formatter.py
@@ -1,20 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from abc import abstractmethod
-from ..core import Metadata, Media, Step
-
-
-@dataclass
-class Formatter(Step):
-    name = "formatter"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Formatter:
-        # only for code typing
-        return Step.init(name, config, Formatter)
-
-    @abstractmethod
-    def format(self, item: Metadata) -> Media: return None
--- a/src/auto_archiver/formatters/mute_formatter.py
+++ b/src/auto_archiver/formatters/mute_formatter.py
@@ -1,16 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-
-from ..core import Metadata, Media
-from . import Formatter
-
-
-@dataclass
-class MuteFormatter(Formatter):
-    name = "mute_formatter"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def format(self, item: Metadata) -> Media: return None
--- a/src/auto_archiver/modules/api_db/init.py
+++ b/src/auto_archiver/modules/api_db/init.py
@@ -0,0 +1 @@
+from .api_db import AAApiDb
--- a/src/auto_archiver/modules/api_db/manifest.py
+++ b/src/auto_archiver/modules/api_db/manifest.py
@@ -0,0 +1,54 @@
+{
+    "name": "Auto-Archiver API Database",
+    "type": ["database"],
+    "entry_point": "api_db::AAApiDb",
+    "requires_setup": True,  # `api_endpoint` below is required, so the module cannot run unconfigured
+    "dependencies": {
+        "python": ["requests", "loguru"],
+    },
+    "configs": {
+        "api_endpoint": {
+            "required": True,
+            "help": "API endpoint where calls are made to",
+        },
+        "api_token": {"default": None,
+                      "help": "API Bearer token."},
+        "public": {
+            "default": False,
+            "type": "bool",
+            "help": "whether the URL should be publicly available via the API",
+        },
+        "author_id": {"default": None, "help": "which email to assign as author"},
+        "group_id": {
+            "default": None,
+            "help": "which group of users have access to the archive in case public=false as author",
+        },
+        "use_api_cache": {
+            "default": True,  # AAApiDb.fetch only queries the API when this is truthy
+            "type": "bool",
+            "help": "if True then the API database will be queried prior to any archiving operations and archiving will stop if the link has already been archived",
+        },
+        "store_results": {
+            "default": True,
+            "type": "bool",
+            "help": "when set, will send the results to the API database.",
+        },
+        "tags": {
+            "default": [],
+            "help": "what tags to add to the archived URL",
+        },
+    },
+    "description": """
+     Provides integration with the Auto-Archiver API for querying and storing archival data.
+
+### Features
+- **API Integration**: Supports querying for existing archives and submitting results.
+- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is enabled.
+- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
+- **Tagging and Metadata**: Adds tags and manages metadata for archives.
+- **Optional Storage**: Archives results conditionally based on configuration.
+
+### Setup
+Requires access to an Auto-Archiver API instance and a valid API token.
+     """,
+}
--- a/src/auto_archiver/modules/api_db/api_db.py
+++ b/src/auto_archiver/modules/api_db/api_db.py
@@ -0,0 +1,55 @@
+from typing import Union
+
+import os
+import requests
+from loguru import logger
+
+from auto_archiver.core import Database
+from auto_archiver.core import Metadata
+
+
+class AAApiDb(Database):
+    """Database backed by an auto-archiver-api instance, reached over HTTP via `api_endpoint`."""
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """Return the most complete previous archive of `item`'s URL, or False when
+        `use_api_cache` is off, nothing was found, or the API call failed."""
+        if not self.use_api_cache: return False
+
+        params = {"url": item.get_url(), "limit": 15}
+        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
+        # plain "/" join: os.path.join would insert a backslash separator on Windows
+        response = requests.get(f"{self.api_endpoint.rstrip('/')}/url/search", params=params, headers=headers)
+
+        if response.status_code == 200:
+            results = response.json()
+            if len(results):
+                logger.success(f"API returned {len(results)} previously archived instance(s)")
+                return Metadata.choose_most_complete([Metadata.from_dict(r["result"]) for r in results])
+        else:
+            logger.error(f"AA API FAIL ({response.status_code}): {response.text}")  # raw body: errors may not be JSON
+        return False
+
+    def done(self, item: Metadata, cached: bool = False) -> None:
+        """Submit the finished archive of `item` to the API, unless `store_results` is off or it was cached."""
+        if not self.store_results: return
+        if cached:
+            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
+            return
+        logger.debug(f"saving archive of {item.get_url()} to the AA API.")
+
+        payload = {
+            'author_id': self.author_id,
+            'url': item.get_url(),
+            'public': self.public,
+            'group_id': self.group_id,
+            'tags': list(self.tags),
+            'result': item.to_json(),
+        }
+        headers = {"Authorization": f"Bearer {self.api_token}"}
+        response = requests.post(f"{self.api_endpoint.rstrip('/')}/interop/submit-archive", json=payload, headers=headers)
+
+        if response.status_code == 201:
+            logger.success(f"AA API: {response.json()}")
+        else:
+            logger.error(f"AA API FAIL ({response.status_code}): {response.text}")  # raw body: errors may not be JSON
--- a/src/auto_archiver/modules/atlos_db/init.py
+++ b/src/auto_archiver/modules/atlos_db/init.py
@@ -0,0 +1 @@
+from .atlos_db import AtlosDb  # relative import, so it resolves inside the package regardless of sys.path
--- a/src/auto_archiver/modules/atlos_db/manifest.py
+++ b/src/auto_archiver/modules/atlos_db/manifest.py
@@ -0,0 +1,36 @@
+{
+    "name": "Atlos Database",
+    "type": ["database"],
+    "entry_point": "atlos_db::AtlosDb",
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru", "requests"],  # atlos_db.py imports both
+        "bin": [],  # no external binaries; empty-string placeholders removed
+    },
+    "configs": {
+        "api_token": {
+            "default": None,
+            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+        },
+        "atlos_url": {
+            "default": "https://platform.atlos.org",
+            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+            "type": "str"
+        },
+    },
+    "description": """
+Handles integration with the Atlos platform for managing archival results.
+
+### Features
+- Outputs archival results to the Atlos API for storage and tracking.
+- Updates failure status with error details when archiving fails.
+- Processes and formats metadata, including ISO formatting for datetime fields.
+- Skips processing for items without an Atlos ID.
+
+### Setup
+Required configs:
+- atlos_url: Base URL for the Atlos API.
+- api_token: Authentication token for API access.
+"""
+,
+}
--- a/src/auto_archiver/modules/atlos_db/atlos_db.py
+++ b/src/auto_archiver/modules/atlos_db/atlos_db.py
@@ -1,13 +1,10 @@
-import os
 from typing import Union
-from loguru import logger
-from csv import DictWriter
-from dataclasses import asdict
-import requests

-from . import Database
-from ..core import Metadata
-from ..utils import get_atlos_config_options
+import requests
+from loguru import logger
+
+from auto_archiver.core import Database
+from auto_archiver.core import Metadata


 class AtlosDb(Database):
@@ -15,16 +12,6 @@ class AtlosDb(Database):
    Outputs results to Atlos
    """

-    name = "atlos_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()
-
    def failed(self, item: Metadata, reason: str) -> None:
        """Update DB accordingly for failure"""
        # If the item has no Atlos ID, there's nothing for us to do
--- a/src/auto_archiver/modules/atlos_db/base_configs.py
+++ b/src/auto_archiver/modules/atlos_db/base_configs.py
@@ -0,0 +1,13 @@
+def get_atlos_config_options():
+    """Return the option definitions for connecting to Atlos: `api_token` and `atlos_url`."""
+    token_option = {
+        "default": None,
+        "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+        "type": str
+    }
+    url_option = {
+        "default": "https://platform.atlos.org",
+        "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+        "type": str
+    }
+    return {"api_token": token_option, "atlos_url": url_option}
--- a/src/auto_archiver/modules/atlos_feeder/init.py
+++ b/src/auto_archiver/modules/atlos_feeder/init.py
@@ -0,0 +1 @@
+from .atlos_feeder import AtlosFeeder
--- a/src/auto_archiver/modules/atlos_feeder/manifest.py
+++ b/src/auto_archiver/modules/atlos_feeder/manifest.py
@@ -0,0 +1,34 @@
+{
+    "name": "Atlos Feeder",
+    "type": ["feeder"],
+    "requires_setup": True,  # `api_token` below is required and has no default
+    "dependencies": {
+        "python": ["loguru", "requests"],
+    },
+    "configs": {
+        "api_token": {
+            "type": "str",
+            "required": True,
+            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+        },
+        "atlos_url": {
+            "default": "https://platform.atlos.org",
+            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+            "type": "str"
+        },
+    },
+    "description": """
+    AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
+
+    ### Features
+    - Connects to the Atlos API to retrieve a list of source material URLs.
+    - Filters source materials based on visibility, processing status, and metadata.
+    - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
+    - Iterates through paginated results using a cursor for efficient API interaction.
+
+    ### Notes
+    - Requires an Atlos API endpoint and a valid API token for authentication.
+    - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
+    - Handles pagination transparently when retrieving data from the Atlos API.
+    """
+}
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -1,23 +1,11 @@
-from loguru import logger
 import requests
+from loguru import logger

-from . import Feeder
-from ..core import Metadata, ArchivingContext
-from ..utils import get_atlos_config_options
+from auto_archiver.core import Feeder
+from auto_archiver.core import Metadata


 class AtlosFeeder(Feeder):
-    name = "atlos_feeder"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        if type(self.api_token) != str:
-            raise Exception("Atlos Feeder did not receive an Atlos API token")
-
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()

    def __iter__(self) -> Metadata:
        # Get all the urls from the Atlos API
@@ -52,5 +40,3 @@ class AtlosFeeder(Feeder):

            if len(data["results"]) == 0 or cursor is None:
                break
-
-        logger.success(f"Processed {count} URL(s)")
--- a/src/auto_archiver/modules/atlos_storage/atlos_storage.py
+++ b/src/auto_archiver/modules/atlos_storage/atlos_storage.py
@@ -1,23 +1,15 @@
-import os
-from typing import IO, List, Optional
-from loguru import logger
-import requests
 import hashlib
+import os
+from typing import IO, Optional

-from ..core import Media, Metadata
-from ..storages import Storage
-from ..utils import get_atlos_config_options
+import requests
+from loguru import logger
+
+from auto_archiver.core import Media, Metadata
+from auto_archiver.core import Storage


 class AtlosStorage(Storage):
-    name = "atlos_storage"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return dict(Storage.configs(), **get_atlos_config_options())

    def get_cdn_url(self, _media: Media) -> str:
        # It's not always possible to provide an exact URL, because it's
--- a/src/auto_archiver/modules/console_db/init.py
+++ b/src/auto_archiver/modules/console_db/init.py
@@ -0,0 +1 @@
+from .console_db import ConsoleDb
--- a/src/auto_archiver/modules/console_db/manifest.py
+++ b/src/auto_archiver/modules/console_db/manifest.py
@@ -0,0 +1,22 @@
+{
+    "name": "Console Database",
+    "type": ["database"],
+    "requires_setup": False,  # no "configs" are declared, so there is nothing to set up
+    "dependencies": {
+        "python": ["loguru"],
+    },
+    "description": """
+Provides a simple database implementation that outputs archival results and status updates to the console.
+
+### Features
+- Logs the status of archival tasks directly to the console, including:
+  - started
+  - failed (with error details)
+  - aborted
+  - done (with optional caching status)
+- Useful for debugging or lightweight setups where no external database is required.
+
+### Setup
+No additional configuration is required.
+""",
+}
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -1,22 +1,13 @@
 from loguru import logger

-from . import Database
-from ..core import Metadata
+from auto_archiver.core import Database
+from auto_archiver.core import Metadata


 class ConsoleDb(Database):
    """
        Outputs results to the console
    """
-    name = "console_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return {}

    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")
--- a/src/auto_archiver/modules/csv_db/init.py
+++ b/src/auto_archiver/modules/csv_db/init.py
@@ -0,0 +1 @@
+from .csv_db import CSVDb
--- a/src/auto_archiver/modules/csv_db/manifest.py
+++ b/src/auto_archiver/modules/csv_db/manifest.py
@@ -0,0 +1,23 @@
+{
+    "name": "CSV Database",
+    "type": ["database"],
+    # `csv_file` has a default value, so the module runs without any setup
+    "requires_setup": False,
+    "dependencies": {"python": ["loguru"]},
+    "entry_point": "csv_db::CSVDb",
+    "configs": {
+        "csv_file": {"default": "db.csv", "help": "CSV file name"},
+    },
+    "description": """
+Handles exporting archival results to a CSV file.
+
+### Features
+- Saves archival metadata as rows in a CSV file.
+- Automatically creates the CSV file with a header if it does not exist.
+- Appends new metadata entries to the existing file.
+
+### Setup
+Required config:
+- csv_file: Path to the CSV file where results will be stored (default: "db.csv").
+""",
+}
--- a/src/auto_archiver/modules/csv_db/csv_db.py
+++ b/src/auto_archiver/modules/csv_db/csv_db.py
@@ -3,26 +3,14 @@ from loguru import logger
 from csv import DictWriter
 from dataclasses import asdict

-from . import Database
-from ..core import Metadata
+from auto_archiver.core import Database
+from auto_archiver.core import Metadata


 class CSVDb(Database):
    """
        Outputs results to a CSV file
    """
-    name = "csv_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.assert_valid_string("csv_file")
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "csv_file": {"default": "db.csv", "help": "CSV file name"}
-        }

    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
--- a/src/auto_archiver/modules/csv_feeder/init.py
+++ b/src/auto_archiver/modules/csv_feeder/init.py
@@ -0,0 +1 @@
+from .csv_feeder import CSVFeeder
--- a/src/auto_archiver/modules/csv_feeder/manifest.py
+++ b/src/auto_archiver/modules/csv_feeder/manifest.py
@@ -0,0 +1,37 @@
+{
+    "name": "CSV Feeder",
+    "type": ["feeder"],
+    # declared exactly once: a duplicated dict key silently keeps only its last value (which was True)
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru"],
+        "bin": [],  # no external binaries needed
+    },
+    "entry_point": "csv_feeder::CSVFeeder",
+    "configs": {
+            "files": {
+                "default": None,
+                "help": "Path to the input file(s) to read the URLs from, comma separated. "
+                        "Input files should be formatted with one URL per line",
+                "required": True,
+                "type": "valid_file",
+                "nargs": "+",
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        },
+    "description": """
+    Reads URLs from CSV files and feeds them into the archiving process.
+
+    ### Features
+    - Supports reading URLs from multiple input files, specified as a comma-separated list.
+    - Allows specifying the column number or name to extract URLs from.
+    - Skips header rows if the first value is not a valid URL.
+
+    ### Setup
+    - Input files should be formatted with one URL per line, with or without a header row.
+    - If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
+    """
+}
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -0,0 +1,38 @@
+from loguru import logger
+import csv
+
+from auto_archiver.core import Feeder
+from auto_archiver.core import Metadata
+from auto_archiver.utils import url_or_none
+
+class CSVFeeder(Feeder):
+    """Yields one Metadata per valid URL in the configured CSV `files`; `column` (index or header name) selects the cell."""
+    column = None
+
+    def __iter__(self) -> Metadata:
+        for file in self.files:
+            with open(file, "r", newline="") as f:  # newline="" is what the csv module expects
+                reader = csv.reader(f)
+                first_row = next(reader, None)  # a bare next() would raise StopIteration inside this generator
+                if first_row is None:
+                    logger.warning(f"No rows found in {file}, skipping")
+                    continue
+                url_column = self.column or 0
+                if isinstance(url_column, str):
+                    try:
+                        url_column = first_row.index(url_column)
+                    except ValueError:
+                        logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
+                        return
+                elif len(first_row) <= url_column or not url_or_none(first_row[url_column]):
+                    logger.debug(f"Skipping header row: {first_row}")  # header (or too-short) row, column index already known
+                else:
+                    f.seek(0)  # first row is data, not a header: rewind so it is read again below
+
+                for row in reader:
+                    if len(row) <= url_column or not url_or_none(row[url_column]):
+                        logger.warning(f"Not a valid URL in row: {row}, skipping")
+                        continue
+                    url = row[url_column]
+                    logger.debug(f"Processing {url}")
+                    yield Metadata().set_url(url)
--- a/src/auto_archiver/modules/gdrive_storage/init.py
+++ b/src/auto_archiver/modules/gdrive_storage/init.py
@@ -0,0 +1 @@
+from .gdrive_storage import GDriveStorage
--- a/src/auto_archiver/modules/gdrive_storage/manifest.py
+++ b/src/auto_archiver/modules/gdrive_storage/manifest.py
@@ -0,0 +1,99 @@
+{
+    "name": "Google Drive Storage",
+    "type": ["storage"],
+    "author": "Dave Mateer",
+    "entry_point": "gdrive_storage::GDriveStorage",
+    "requires_setup": True,  # `root_folder_id` is required and credentials must be supplied
+    "dependencies": {
+        "python": [
+            "loguru",
+            "googleapiclient",
+            "google",
+        ],
+    },
+    "configs": {
+        "path_generator": {
+            "default": "url",
+            "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+            "choices": ["flat", "url", "random"],
+        },
+        "filename_generator": {
+            "default": "static",
+            "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+            "choices": ["random", "static"],
+        },
+        "root_folder_id": {"required": True,
+                           "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
+        "oauth_token": {"default": None,
+                        "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account."},
+        "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
+    },
+    "description": """
+    
+    GDriveStorage: A storage module for saving archived content to Google Drive.
+
+    Author: Dave Mateer
+    Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
+
+    ### Features
+    - Saves media files to Google Drive, organizing them into folders based on the provided path structure.
+    - Supports OAuth token-based authentication or service account credentials for API access.
+    - Automatically creates folders in Google Drive if they don't exist.
+    - Retrieves CDN URLs for stored files, enabling easy sharing and access.
+
+    ### Notes
+    - Requires setup with either a Google OAuth token or a service account JSON file.
+    - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
+    - Automatically handles Google Drive API token refreshes for long-running jobs.
+    
+    ## Overview
+This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
+
+## Features
+- Saves files to Google Drive, organizing them into structured folders.
+- Supports both **service account** and **OAuth token** authentication.
+- Automatically creates folders if they don't exist.
+- Generates public URLs for easy file sharing.
+
+## Setup Guide
+1. **Enable Google Drive API**
+   - Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
+   - Enable the **Google Drive API**.
+
+2. **Set Up a Google Drive Folder**
+   - Create a folder in **Google Drive** and copy its **folder ID** from the URL.
+   - Add the **folder ID** to your configuration (`orchestration.yaml`):
+     ```yaml
+     root_folder_id: "FOLDER_ID"
+     ```
+
+3. **Authentication Options**
+   - **Option 1: Service Account (Recommended)**
+     - Create a **service account** in Google Cloud IAM.
+     - Download the JSON key file and save it as:
+       ```
+       secrets/service_account.json
+       ```
+     - **Share your Drive folder** with the service account’s `client_email` (found in the JSON file).
+     
+   - **Option 2: OAuth Token (User Authentication)**
+     - Create OAuth **Desktop App credentials** in Google Cloud.
+     - Save the credentials as:
+       ```
+       secrets/oauth_credentials.json
+       ```
+     - Generate an OAuth token by running:
+       ```sh
+       python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
+       ```
+
+    
+    Notes on the OAuth token:
+    Tokens are refreshed after 1 hour however keep working for 7 days (tbc)
+    so as long as the job doesn't last for 7 days then this method of refreshing only once per run will work
+    see this link for details on the token:
+    https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+    
+    
+"""
+}
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -1,79 +1,67 @@

-import shutil, os, time, json
+import json
+import os
+import time
 from typing import IO
-from loguru import logger

-from googleapiclient.discovery import build
-from googleapiclient.http import MediaFileUpload
+from google.auth.transport.requests import Request
 from google.oauth2 import service_account
 from google.oauth2.credentials import Credentials
-from google.auth.transport.requests import Request
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload
+from loguru import logger
+
+from auto_archiver.core import Media
+from auto_archiver.core import Storage
+

-from ..core import Media
-from . import Storage


 class GDriveStorage(Storage):
-    name = "gdrive_storage"

-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
+    def setup(self) -> None:
+        self.scopes = ['https://www.googleapis.com/auth/drive']  # passed to both the OAuth and service-account credential constructors
+        # Initialize Google Drive service
+        self._setup_google_drive_service()

-        SCOPES = ['https://www.googleapis.com/auth/drive']
-
-        if self.oauth_token is not None:
-            """
-            Tokens are refreshed after 1 hour 
-            however keep working for 7 days (tbc)
-            so as long as the job doesn't last for 7 days
-            then this method of refreshing only once per run will work
-            see this link for details on the token
-            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
-            """
-            logger.debug(f'Using GD OAuth token {self.oauth_token}')
-            # workaround for missing 'refresh_token' in from_authorized_user_file
-            with open(self.oauth_token, 'r') as stream:
-                creds_json = json.load(stream)
-                creds_json['refresh_token'] = creds_json.get("refresh_token", "")
-            creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
-            # creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
-
-            if not creds or not creds.valid:
-                if creds and creds.expired and creds.refresh_token:
-                    logger.debug('Requesting new GD OAuth token')
-                    creds.refresh(Request())
-                else:
-                    raise Exception("Problem with creds - create the token again")
-
-                # Save the credentials for the next run
-                with open(self.oauth_token, 'w') as token:
-                    logger.debug('Saving new GD OAuth token')
-                    token.write(creds.to_json())
-            else:
-                logger.debug('GD OAuth Token valid')
+    def _setup_google_drive_service(self):
+        """Initialize Google Drive service based on provided credentials."""
+        if self.oauth_token:  # checked first, so the OAuth token wins when both credentials are configured
+            logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
+            self.service = self._initialize_with_oauth_token()
+        elif self.service_account:
+            logger.debug(f"Using Google Drive service account: {self.service_account}")
+            self.service = self._initialize_with_service_account()
        else:
-            gd_service_account = self.service_account
-            logger.debug(f'Using GD Service Account {gd_service_account}')
-            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+            raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")

-        self.service = build('drive', 'v3', credentials=creds)
+    def _initialize_with_oauth_token(self):
+        """Build the Drive v3 service from the OAuth token file, refreshing and re-saving the token when it has expired."""
+        with open(self.oauth_token, 'r') as stream:
+            creds_json = json.load(stream)
+            creds_json['refresh_token'] = creds_json.get("refresh_token", "")  # make sure the key exists, defaulting to ""

-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
-                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
-                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
-            })
+        creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
+        if not creds.valid and creds.expired and creds.refresh_token:  # expired but refreshable
+            creds.refresh(Request())
+            with open(self.oauth_token, 'w') as token_file:  # overwrite the same file with the refreshed token
+                logger.debug("Saving refreshed OAuth token.")
+                token_file.write(creds.to_json())
+        elif not creds.valid:
+            raise ValueError("Invalid OAuth token. Please regenerate the token.")
+
+        return build('drive', 'v3', credentials=creds)
+
+    def _initialize_with_service_account(self):
+        """Build the Drive v3 service from the service-account JSON file at `self.service_account`."""
+        creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
+        return build('drive', 'v3', credentials=creds)

    def get_cdn_url(self, media: Media) -> str:
        """
        only support files saved in a folder for GD
        S3 supports folder and all stored in the root
        """
-
        # full_name = os.path.join(self.folder, media.key)
        parent_id, folder_id = self.root_folder_id, None
        path_parts = media.key.split(os.path.sep)
@@ -82,13 +70,16 @@ class GDriveStorage(Storage):
        for folder in path_parts[0:-1]:
            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
            parent_id = folder_id
-
        # get id of file inside folder (or sub folder)
-        file_id = self._get_id_from_parent_and_name(folder_id, filename)
+        file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
+        if not file_id:
+            #
+            logger.info(f"file {filename} not found in folder {folder_id}")
+            return None
        return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"

    def upload(self, media: Media, **kwargs) -> bool:
-        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
+        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
        """
        1. for each sub-folder in the path check if exists or create
        2. upload file to root_id/other_paths.../filename
@@ -116,7 +107,13 @@ class GDriveStorage(Storage):
    # must be implemented even if unused
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
+    def _get_id_from_parent_and_name(self, parent_id: str,
+                                     name: str,
+                                     retries: int = 1,
+                                     sleep_seconds: int = 10,
+                                     use_mime_type: bool = False,
+                                     raise_on_missing: bool = True,
+                                     use_cache=False):
        """
        Retrieves the id of a folder or file from its @name and the @parent_id folder
        Optionally does multiple @retries and sleeps @sleep_seconds between them
@@ -179,8 +176,3 @@ class GDriveStorage(Storage):
        gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
        return gd_folder.get('id')

-    # def exists(self, key):
-    #     try:
-    #         self.get_cdn_url(key)
-    #         return True
-    #     except: return False
--- a/src/auto_archiver/modules/generic_extractor/init.py
+++ b/src/auto_archiver/modules/generic_extractor/init.py
@@ -0,0 +1 @@
+from .generic_extractor import GenericExtractor
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -0,0 +1,68 @@
+{
+    "name": "Generic Extractor",
+    "version": "0.1.0",
+    "author": "Bellingcat",
+    "type": ["extractor"],
+    "requires_setup": False,
+    "dependencies": {
+        "python": ["yt_dlp", "requests", "loguru", "slugify"],
+    },
+    "description": """
+This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.
+
+This module is responsible for downloading and processing media content from platforms
+supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
+for retrieving videos, subtitles, comments, and other metadata, and it integrates with
+the broader archiving framework.
+
+### Features
+- Supports downloading videos and playlists.
+- Retrieves metadata like titles, descriptions, upload dates, and durations.
+- Downloads subtitles and comments when enabled.
+- Configurable options for handling live streams, proxies, and more.
+- Supports authentication of websites using the 'authentication' settings from your orchestration.
+
+### Dropins
+- For websites supported by `yt-dlp` that also contain posts in addition to videos
+  (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create
+  metadata objects. Some dropins are included in this generic_extractor by default, but
+  custom dropins can be created to handle additional websites and passed to the archiver
+  via the command line using the `--dropins` option (TODO!).
+""",
+    "configs": {
+        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
+        "comments": {
+            "default": False,
+            "help": "download all comments if available, may lead to large metadata",
+            "type": "bool",
+        },
+        "livestreams": {
+            "default": False,
+            "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control",
+            "type": "bool",
+        },
+        "live_from_start": {
+            "default": False,
+            "help": "if set, will download live streams from their earliest available moment, otherwise starts now.",
+            "type": "bool",
+        },
+        "proxy": {
+            "default": "",
+            "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
+        },
+        "end_means_success": {
+            "default": True,
+            "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
+            "type": "bool",
+        },
+        "allow_playlist": {
+            "default": False,
+            "help": "If True will also download playlists, set to False if the expectation is to download a single video.",
+            "type": "bool",
+        },
+        "max_downloads": {
+            "default": "inf",
+            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
+        },
+    },
+}
--- a/src/auto_archiver/archivers/generic_archiver/bluesky.py
+++ b/src/auto_archiver/archivers/generic_archiver/bluesky.py
@@ -1,17 +1,12 @@
-import os
-import mimetypes
-
-import requests
 from loguru import logger

-from auto_archiver.core.context import ArchivingContext
-from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.core.extractor import Extractor
 from auto_archiver.core.metadata import Metadata, Media
 from .dropin import GenericDropin, InfoExtractor

 class Bluesky(GenericDropin):

-    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        result.set_url(url)
        result.set_title(post["record"]["text"])
@@ -28,21 +23,10 @@ class Bluesky(GenericDropin):

    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
        # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
-        # handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
-        # return ie_instance._extract_post(handle=handle, post_id=video_id)
-
        handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
-        return ie_instance._download_json(
-            'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
-            video_id, query={
-                'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
-                'depth': 0,
-                'parentHeight': 0,
-            })['thread']['post']
+        return ie_instance._extract_post(handle=handle, post_id=video_id)

-
-
-    def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
+    def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
        """
        Iterates over image(s) or video in a Bluesky post and downloads them        
        """
@@ -55,11 +39,11 @@ class Bluesky(GenericDropin):
        for image_media in image_medias:
            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
            image_media = archiver.download_from_url(url)
-            media.append(image_media)
+            media.append(Media(image_media))
        for video_media in video_medias:
            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
            video_media = archiver.download_from_url(url)
-            media.append(video_media)
+            media.append(Media(video_media))
        return media


--- a/src/auto_archiver/archivers/generic_archiver/dropin.py
+++ b/src/auto_archiver/archivers/generic_archiver/dropin.py
@@ -1,6 +1,6 @@
 from yt_dlp.extractor.common import InfoExtractor
 from auto_archiver.core.metadata import Metadata
-from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.core.extractor import Extractor

 class GenericDropin:
    """Base class for dropins for the generic extractor.
@@ -30,7 +30,7 @@ class GenericDropin:
        raise NotImplementedError("This method should be implemented in the subclass")
    

-    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        """
        This method should create a Metadata object from the post data.
        """
--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -0,0 +1,18 @@
+from .dropin import GenericDropin
+
+
+class Facebook(GenericDropin):
+    """Dropin that turns a Facebook page into post metadata for the generic extractor."""
+    def extract_post(self, url: str, ie_instance):
+        """Fetch the page once, from www. rather than m., and return `ie_instance._extract_metadata` of it."""
+        video_id = ie_instance._match_valid_url(url).group('id')
+        webpage = ie_instance._download_webpage(
+            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
+        # TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
+        return ie_instance._extract_metadata(webpage)
+
+    def create_metadata(self, post: dict, ie_instance, archiver, url):
+        """Create the item's metadata via `archiver` and set its title, content and raw post data from `post`."""
+        metadata = archiver.create_metadata(url)
+        metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
+        return metadata
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,53 +1,17 @@
-"""
-This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood.
-
-This module is responsible for downloading and processing media content from platforms
-supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
-for retrieving videos, subtitles, comments, and other metadata, and it integrates with
-the broader archiving framework.
-
-### Features
- Supports downloading videos and playlists.
- Retrieves metadata like titles, descriptions, upload dates, and durations.
- Downloads subtitles and comments when enabled.
- Configurable options for handling live streams, proxies, and more.
-
-### Dropins
- For websites supported by `yt-dlp` that also contain posts in addition to videos
- (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create 
- metadata objects. Some dropins are included in this generic_archiver by default, but
-custom dropins can be created to handle additional websites and passed to the archiver
-via the command line using the `--dropins` option (TODO!).
-
-"""
-
-
 import datetime, os, yt_dlp, pysubs2
 import importlib
-from typing import Type
+from typing import Generator, Type
 from yt_dlp.extractor.common import InfoExtractor

 from loguru import logger

-from auto_archiver.archivers.archiver import Archiver
-from ...core import Metadata, Media, ArchivingContext
+from auto_archiver.core.extractor import Extractor
+from auto_archiver.core import Metadata, Media

-class GenericArchiver(Archiver):
-    name = "youtubedl_archiver" #left as is for backwards compat
+class GenericExtractor(Extractor):
    _dropins = {}

-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.subtitles = bool(self.subtitles)
-        self.comments = bool(self.comments)
-        self.livestreams = bool(self.livestreams)
-        self.live_from_start = bool(self.live_from_start)
-        self.end_means_success = bool(self.end_means_success)
-        self.allow_playlist = bool(self.allow_playlist)
-        self.max_downloads = self.max_downloads
-
-
-    def suitable_extractors(self, url: str) -> list[str]:
+    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
@@ -152,11 +116,12 @@ class GenericArchiver(Archiver):

    def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
        """
-        Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
+        Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
        """

        ie_instance = info_extractor(downloader=ydl)
        dropin = self.dropin_for_name(info_extractor.ie_key())
+
        if not dropin:
            # TODO: add a proper link to 'how to create your own dropin'
            logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
@@ -207,6 +172,7 @@ class GenericArchiver(Archiver):
        return self.add_metadata(data, info_extractor, url, result)
    
    def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
+        dropin_name = dropin_name.lower()

        if dropin_name == "generic":
            # no need for a dropin for the generic extractor (?)
@@ -300,19 +266,35 @@ class GenericArchiver(Archiver):
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

-        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
-            logger.debug('Using Facebook cookie')
-            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+        #TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
+        if url.startswith("https://ya.ru"):
+            url = url.replace("https://ya.ru", "https://yandex.ru")
+            item.set("replaced_url", url)

-        if item.netloc in ['youtube.com', 'www.youtube.com']:
-            if self.cookies_from_browser:
-                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
-                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
-            elif self.cookie_file:
-                logger.debug(f'Using cookies from file {self.cookie_file}')
-                ydl_options['cookiefile'] = self.cookie_file
+
+        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
+                       'quiet': False, 'noplaylist': not self.allow_playlist ,
+                       'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
+                       "live_from_start": self.live_from_start, "proxy": self.proxy,
+                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+        
+        # set up auth
+        auth = self.auth_for_site(url, extract_cookies=False)
+        # order of importance: username/password -> cookie -> cookies_from_browser -> cookies_file (api_key is not handled here)
+        if auth:
+            if 'username' in auth and 'password' in auth:
+                logger.debug(f'Using provided auth username and password for {url}')
+                ydl_options['username'] = auth['username']
+                ydl_options['password'] = auth['password']
+            elif 'cookie' in auth:
+                logger.debug(f'Using provided auth cookie for {url}')
+                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
+            elif 'cookies_from_browser' in auth:
+                logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
+                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']  # NOTE(review): yt-dlp documents this option as a tuple - confirm the auth value's type
+            elif 'cookies_file' in auth:
+                logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
+                ydl_options['cookiefile'] = auth['cookies_file']

        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

@@ -320,6 +302,6 @@ class GenericArchiver(Archiver):
            result = self.download_for_extractor(info_extractor, url, ydl)
            if result:
                return result
-       
+

        return False
--- a/src/auto_archiver/archivers/generic_archiver/truth.py
+++ b/src/auto_archiver/archivers/generic_archiver/truth.py
@@ -2,7 +2,7 @@ from typing import Type

 from auto_archiver.utils import traverse_obj
 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.core.extractor import Extractor
 from yt_dlp.extractor.common import InfoExtractor

 from dateutil.parser import parse as parse_dt
@@ -19,7 +19,7 @@ class Truth(GenericDropin):
    def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
        return True

-    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        """
        Creates metadata from a truth social post
        
--- a/src/auto_archiver/archivers/generic_archiver/twitter.py
+++ b/src/auto_archiver/archivers/generic_archiver/twitter.py
@@ -5,8 +5,8 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.utils import UrlUtil
-from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.utils import url as UrlUtil
+from auto_archiver.core.extractor import Extractor

 from .dropin import GenericDropin, InfoExtractor

@@ -32,7 +32,7 @@ class Twitter(GenericDropin):
        twid = ie_instance._match_valid_url(url).group('id')
        return ie_instance._extract_status(twid=twid)

-    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
            if not tweet.get("user") or not tweet.get("created_at"):
--- a/src/auto_archiver/modules/gsheet_db/__init__.py
+++ b/src/auto_archiver/modules/gsheet_db/__init__.py
@@ -0,0 +1 @@
+from .gsheet_db import GsheetsDb
--- a/src/auto_archiver/modules/gsheet_db/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py
@@ -0,0 +1,38 @@
+{
+    "name": "Google Sheets Database",
+    "type": ["database"],
+    "entry_point": "gsheet_db::GsheetsDb",
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru", "gspread", "slugify"],
+    },
+    "configs": {
+        "allow_worksheets": {
+            "default": set(),
+            "help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
+        },
+        "block_worksheets": {
+            "default": set(),
+            "help": "(CSV) explicitly block some worksheets from being processed",
+        },
+        "use_sheet_names_in_stored_paths": {
+            "default": True,
+            "type": "bool",
+            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+        }
+    },
+    "description": """
+    GsheetsDatabase:
+    Handles integration with Google Sheets for tracking archival tasks.
+
+### Features
+- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
+- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
+- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
+- Skips redundant updates for empty or invalid data fields.
+
+### Notes
+- Currently works only with metadata provided by GsheetFeeder. 
+- Requires configuration of a linked Google Sheet and appropriate API credentials.
+    """
+}
--- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
@@ -1,47 +1,38 @@
 from typing import Union, Tuple
-import datetime
 from urllib.parse import quote

 from loguru import logger

-from . import Database
-from ..core import Metadata, Media, ArchivingContext
-from ..utils import GWorksheet
+from auto_archiver.core import Database
+from auto_archiver.core import Metadata, Media
+from auto_archiver.modules.gsheet_feeder import GWorksheet
+from auto_archiver.utils.misc import get_current_timestamp


 class GsheetsDb(Database):
    """
-        NB: only works if GsheetFeeder is used. 
-        could be updated in the future to support non-GsheetFeeder metadata 
+    NB: only works if GsheetFeeder is used.
+    could be updated in the future to support non-GsheetFeeder metadata
    """
-    name = "gsheet_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return {}

    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
-        gw.set_cell(row, 'status', 'Archive in progress')
+        gw.set_cell(row, "status", "Archive in progress")

-    def failed(self, item: Metadata, reason:str) -> None:
+    def failed(self, item: Metadata, reason: str) -> None:
        logger.error(f"FAILED {item}")
-        self._safe_status_update(item, f'Archive failed {reason}')
+        self._safe_status_update(item, f"Archive failed {reason}")

    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")
-        self._safe_status_update(item, '')
+        self._safe_status_update(item, "")

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check if the given item has been archived already"""
        return False

-    def done(self, item: Metadata, cached: bool=False) -> None:
+    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)
@@ -53,23 +44,25 @@ class GsheetsDb(Database):
        def batch_if_valid(col, val, final_value=None):
            final_value = final_value or val
            try:
-                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")
+
        status_message = item.status
        if cached:
            status_message = f"[cached] {status_message}"
-        cell_updates.append((row, 'status', status_message))
+        cell_updates.append((row, "status", status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
-            batch_if_valid('archive', "\n".join(media.urls))
-        batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
-        batch_if_valid('title', item.get_title())
-        batch_if_valid('text', item.get("content", ""))
-        batch_if_valid('timestamp', item.get_timestamp())
-        if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
+            batch_if_valid("archive", "\n".join(media.urls))
+        batch_if_valid("date", True, get_current_timestamp())
+        batch_if_valid("title", item.get_title())
+        batch_if_valid("text", item.get("content", ""))
+        batch_if_valid("timestamp", item.get_timestamp())
+        if media:
+            batch_if_valid("hash", media.get("hash", "not-calculated"))

        # merge all pdq hashes into a single string, if present
        pdq_hashes = []
@@ -78,35 +71,44 @@ class GsheetsDb(Database):
            if pdq := m.get("pdq_hash"):
                pdq_hashes.append(pdq)
        if len(pdq_hashes):
-            batch_if_valid('pdq_hash', ",".join(pdq_hashes))
+            batch_if_valid("pdq_hash", ",".join(pdq_hashes))

-        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
-            batch_if_valid('screenshot', "\n".join(screenshot.urls))
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
+            screenshot, "urls"
+        ):
+            batch_if_valid("screenshot", "\n".join(screenshot.urls))

-        if (thumbnail := item.get_first_image("thumbnail")):
+        if thumbnail := item.get_first_image("thumbnail"):
            if hasattr(thumbnail, "urls"):
-                batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
+                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')

-        if (browsertrix := item.get_media_by_id("browsertrix")):
-            batch_if_valid('wacz', "\n".join(browsertrix.urls))
-            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
+        if browsertrix := item.get_media_by_id("browsertrix"):
+            batch_if_valid("wacz", "\n".join(browsertrix.urls))
+            batch_if_valid(
+                "replaywebpage",
+                "\n".join(
+                    [
+                        f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
+                        for wacz in browsertrix.urls
+                    ]
+                ),
+            )

        gw.batch_set_cell(cell_updates)

    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
        try:
            gw, row = self._retrieve_gsheet(item)
-            gw.set_cell(row, 'status', new_status)
+            gw.set_cell(row, "status", new_status)
        except Exception as e:
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        if gsheet := ArchivingContext.get("gsheet"):
+        """Return the (worksheet, row) stored in the item's "gsheet" context; raise ValueError when that context is missing."""
+        if gsheet := item.get_context("gsheet"):
            gw: GWorksheet = gsheet.get("worksheet")
            row: int = gsheet.get("row")
-        elif self.sheet_id:
-            print(self.sheet_id)
-
-
+        else:
+            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
+            raise ValueError(f"No 'gsheet' context found for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
        return gw, row
--- a/src/auto_archiver/modules/gsheet_feeder/__init__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py
@@ -0,0 +1,2 @@
+from .gworksheet import GWorksheet
+from .gsheet_feeder import GsheetsFeeder
--- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
@@ -0,0 +1,71 @@
+{
+    "name": "Google Sheets Feeder",
+    "type": ["feeder"],
+    "entry_point": "gsheet_feeder::GsheetsFeeder",
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru", "gspread", "slugify"],
+    },
+    "configs": {
+        "sheet": {"default": None, "help": "name of the sheet to archive"},
+        "sheet_id": {
+            "default": None,
+            "help": "(alternative to sheet name) the id of the sheet to archive",
+        },
+        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
+        "service_account": {
+            "default": "secrets/service_account.json",
+            "help": "service account JSON file path",
+        },
+        "columns": {
+            "default": {
+                "url": "link",
+                "status": "archive status",
+                "folder": "destination folder",
+                "archive": "archive location",
+                "date": "archive date",
+                "thumbnail": "thumbnail",
+                "timestamp": "upload timestamp",
+                "title": "upload title",
+                "text": "text content",
+                "screenshot": "screenshot",
+                "hash": "hash",
+                "pdq_hash": "perceptual hashes",
+                "wacz": "wacz",
+                "replaywebpage": "replaywebpage",
+            },
+            "help": "names of columns in the google sheet (stringified JSON object)",
+            "type": "auto_archiver.utils.json_loader",
+        },
+        "allow_worksheets": {
+            "default": set(),
+            "help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
+        },
+        "block_worksheets": {
+            "default": set(),
+            "help": "(CSV) explicitly block some worksheets from being processed",
+        },
+        "use_sheet_names_in_stored_paths": {
+            "default": True,
+            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+            "type": "bool",
+        },
+    },
+    "description": """
+    GsheetsFeeder 
+    A Google Sheets-based feeder for the Auto Archiver.
+
+    This reads data from Google Sheets and filters rows based on user-defined rules.
+    The filtered rows are processed into `Metadata` objects.
+
+    ### Features
+    - Validates the sheet structure and filters rows based on input configurations.
+    - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
+    - Ensures only rows with valid URLs and unprocessed statuses are included for archival.
+    - Supports organizing stored files into folder paths based on sheet and worksheet names.
+
+    ### Notes
+    - Requires a Google Service Account JSON file for authentication. The default location is `secrets/service_account.json`.
+    - Create the sheet using the template provided in the docs.
+    """,
+}
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -0,0 +1,96 @@
+"""
+GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
+
+This reads data from Google Sheets and filters rows based on user-defined rules.
+The filtered rows are processed into `Metadata` objects.
+
+### Key properties
+- validates the sheet's structure and filters rows based on input configurations.
+- Ensures only rows with valid URLs and unprocessed statuses are included.
+"""
+import os
+import gspread
+
+from loguru import logger
+from slugify import slugify
+
+from auto_archiver.core import Feeder
+from auto_archiver.core import Metadata
+from . import GWorksheet
+
+
+class GsheetsFeeder(Feeder):
+    """Feeder that yields a `Metadata` for every worksheet row holding a URL and an empty status."""
+
+    def setup(self) -> None:
+        """Create the gspread client and require that a `sheet` name or a `sheet_id` is configured."""
+        self.gsheets_client = gspread.service_account(filename=self.service_account)
+        # TODO mv to validators
+        assert self.sheet or self.sheet_id, (
+            "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
+        )
+
+    def open_sheet(self):
+        """Open the spreadsheet by name when `sheet` is set, otherwise by `sheet_id`."""
+        if self.sheet:
+            return self.gsheets_client.open(self.sheet)
+        return self.gsheets_client.open_by_key(self.sheet_id)
+
+    def __iter__(self) -> Metadata:
+        """Yield the pending rows of each worksheet that passes the allow/block rules and has the required columns."""
+        sh = self.open_sheet()
+        for ii, worksheet in enumerate(sh.worksheets()):
+            if not self.should_process_sheet(worksheet.title):
+                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
+                continue
+            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
+            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
+            if len(missing_cols := self.missing_required_columns(gw)):
+                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
+                continue
+            # process and yield metadata here:
+            yield from self._process_rows(gw)
+            logger.success(f'Finished worksheet {worksheet.title}')
+
+    def _process_rows(self, gw: GWorksheet):
+        """Yield a `Metadata` for each row below the header that has a URL and an empty status."""
+        for row in range(1 + self.header, gw.count_rows() + 1):
+            url = gw.get_cell(row, 'url').strip()
+            if not len(url): continue
+            original_status = gw.get_cell(row, 'status')
+            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
+            # TODO: custom status parser(?) aka should_retry_from_status
+            if status not in ['', None]: continue
+
+            # All checks done - archival process starts here
+            m = Metadata().set_url(url)
+            self._set_context(m, gw, row)
+            yield m
+
+    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
+        """Store the worksheet/row, and the destination folder when the row has one, on `m`; return `m`."""
+        # TODO: Check folder value not being recognised
+        m.set_context("gsheet", {"row": row, "worksheet": gw})
+
+        folder = slugify((gw.get_cell_or_default(row, 'folder', "") or "").strip())
+        if folder:
+            if self.use_sheet_names_in_stored_paths:
+                # `sheet` is None when the spreadsheet is opened through `sheet_id`, so fall back to the id
+                workbook = slugify(self.sheet or self.sheet_id)
+                folder = os.path.join(folder, workbook, slugify(gw.wks.title))
+            m.set_context("folder", folder)
+        return m
+
+    def should_process_sheet(self, sheet_name: str) -> bool:
+        """Return False when `sheet_name` fails a non-empty allow list or appears in the block list."""
+        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
+            # ALLOW rules exist AND sheet name not explicitly allowed
+            return False
+        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
+            # BLOCK rules exist AND sheet name is blocked
+            return False
+        return True
+
+    def missing_required_columns(self, gw: GWorksheet) -> list:
+        """Return whichever of the required 'url' and 'status' columns `gw` does not have."""
+        return [required_col for required_col in ['url', 'status'] if not gw.col_exists(required_col)]
--- a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
--- a/src/auto_archiver/modules/hash_enricher/__init__.py
+++ b/src/auto_archiver/modules/hash_enricher/__init__.py
@@ -0,0 +1 @@
+from .hash_enricher import HashEnricher
--- a/src/auto_archiver/modules/hash_enricher/manifest.py
+++ b/src/auto_archiver/modules/hash_enricher/manifest.py
@@ -0,0 +1,31 @@
+{
+    "name": "Hash Enricher",
+    "type": ["enricher"],
+    "requires_setup": False,
+    "dependencies": {"python": ["loguru"]},
+    "configs": {
+        # hash function applied to each media file
+        "algorithm": {
+            "default": "SHA-256",
+            "help": "hash algorithm to use",
+            "choices": ["SHA-256", "SHA3-512"],
+        },
+        # TODO add non-negative requirement to match previous implementation?
+        "chunksize": {
+            "default": 16000000,
+            "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
+            "type": "int",
+        },
+    },
+    "description": """
+Generates cryptographic hashes for media files to ensure data integrity and authenticity.
+
+### Features
+- Calculates cryptographic hashes (SHA-256 or SHA3-512) for media files stored in `Metadata` objects.
+- Ensures content authenticity, integrity validation, and duplicate identification.
+- Efficiently processes large files by reading file bytes in configurable chunk sizes.
+- Supports dynamic configuration of hash algorithms and chunk sizes.
+- Updates media metadata with the computed hash value in the format `<algorithm>:<hash>`.
+
+### Notes
+- Default hash algorithm is SHA-256, but SHA3-512 is also supported.
+- Chunk size defaults to 16 MB but can be adjusted based on memory requirements.
+- Useful for workflows requiring hash-based content validation or deduplication.
+""",
+}
--- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py
+++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py
@@ -0,0 +1,39 @@
+""" Hash Enricher for generating cryptographic hashes of media files.
+
+The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
+for media files stored in `Metadata` objects. These hashes are used for
+validating content integrity, ensuring data authenticity, and identifying
+exact duplicates. The hash is computed by reading the file's bytes in chunks,
+making it suitable for handling large files efficiently.
+
+"""
+import hashlib
+from loguru import logger
+
+from auto_archiver.core import Enricher
+from auto_archiver.core import Metadata
+from auto_archiver.utils.misc import calculate_file_hash
+
+
+class HashEnricher(Enricher):
+    """
+    Enricher that stores a content hash on the media items of a `Metadata`.
+
+    The hash function is selected by `self.algorithm` and `self.chunksize` is
+    forwarded to `calculate_file_hash`.
+    """
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        """Set "hash" to `<algorithm>:<hash>` on each media item that yields a hash."""
+        url = to_enrich.get_url()
+        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
+
+        for index, media in enumerate(to_enrich.media):
+            digest = self.calculate_hash(media.filename)
+            if not len(digest):
+                # nothing to record (e.g. unsupported algorithm)
+                continue
+            to_enrich.media[index].set("hash", f"{self.algorithm}:{digest}")
+
+    def calculate_hash(self, filename) -> str:
+        """Return the hash of `filename`, or "" when `self.algorithm` is not supported."""
+        constructors = {
+            "SHA-256": hashlib.sha256,
+            "SHA3-512": hashlib.sha3_512,
+        }
+        hash_algo = constructors.get(self.algorithm)
+        if hash_algo is None:
+            return ""
+        return calculate_file_hash(filename, hash_algo, self.chunksize)
--- a/src/auto_archiver/modules/html_formatter/init.py
+++ b/src/auto_archiver/modules/html_formatter/init.py
@@ -0,0 +1 @@
+from .html_formatter import HtmlFormatter
--- a/src/auto_archiver/modules/html_formatter/manifest.py
+++ b/src/auto_archiver/modules/html_formatter/manifest.py
@@ -0,0 +1,13 @@
+{
+    "name": "HTML Formatter",
+    "type": ["formatter"],
+    "requires_setup": False,
+    "dependencies": {
+        # NOTE(review): "hash_enricher" looks like a sibling module rather than a
+        # Python package (the formatter fetches it with get_module) - confirm the
+        # loader accepts module names here, and whether the empty "bin" entry is intended
+        "python": ["hash_enricher", "loguru", "jinja2"],
+        "bin": [""],
+    },
+    "configs": {
+        "detect_thumbnails": {
+            "default": True,
+            "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
+        },
+    },
+    "description": """ """,
+}
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from dataclasses import dataclass
 import mimetypes, os, pathlib
 from jinja2 import Environment, FileSystemLoader
 from urllib.parse import quote
@@ -7,32 +6,29 @@ from loguru import logger
 import json
 import base64

-from ..version import __version__
-from ..core import Metadata, Media, ArchivingContext
-from . import Formatter
-from ..enrichers import HashEnricher
-from ..utils.misc import random_str
+from auto_archiver.version import __version__
+from auto_archiver.core import Metadata, Media
+from auto_archiver.core import Formatter
+from auto_archiver.utils.misc import random_str
+from auto_archiver.core.module import get_module

-
-@dataclass
 class HtmlFormatter(Formatter):
-    name = "html_formatter"
+    environment: Environment = None
+    template: any = None
+
+    def setup(self) -> None:
+        """Sets up the Jinja2 environment and loads the template."""
+        template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
+        self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)

-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
        # JinjaHelper class static methods are added as filters
        self.environment.filters.update({
            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
        })
-        self.template = self.environment.get_template("html_template.html")

-    @staticmethod
-    def configs() -> dict:
-        return {
-            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
-        }
+        # Load a specific template or default to "html_template.html"
+        template_name = self.config.get("template_name", "html_template.html")
+        self.template = self.environment.get_template(template_name)

    def format(self, item: Metadata) -> Media:
        url = item.get_url()
@@ -48,12 +44,13 @@ class HtmlFormatter(Formatter):
            version=__version__
        )

-        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
+        html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        final_media = Media(filename=html_path, _mimetype="text/html")

-        he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+        # get the already instantiated hash_enricher module
+        he = get_module('hash_enricher', self.config)
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")

--- a/src/auto_archiver/modules/html_formatter/templates/init.py
+++ b/src/auto_archiver/modules/html_formatter/templates/init.py
--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
--- a/src/auto_archiver/modules/html_formatter/templates/macros.html
+++ b/src/auto_archiver/modules/html_formatter/templates/macros.html
--- a/src/auto_archiver/modules/instagram_api_extractor/init.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/init.py
@@ -0,0 +1 @@
+from .instagram_api_extractor import InstagramAPIExtractor
--- a/src/auto_archiver/modules/instagram_api_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/manifest.py
@@ -0,0 +1,53 @@
+{
+    "name": "Instagram API Extractor",
+    "type": ["extractor"],
+    "entry_point": "instagram_api_extractor::InstagramAPIExtractor",
+    "dependencies": {
+        "python": [
+            "requests",
+            "loguru",
+            "retrying",
+            "tqdm",
+        ],
+    },
+    "requires_setup": True,
+    "configs": {
+        # credentials / location of the Instagrapi API deployment
+        "access_token": {
+            "default": None,
+            "help": "a valid instagrapi-api token",
+        },
+        "api_endpoint": {
+            "required": True,
+            "help": "API endpoint to use",
+        },
+        # profile download behaviour
+        "full_profile": {
+            "default": False,
+            "type": "bool",
+            "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
+        },
+        "full_profile_max_posts": {
+            "default": 0,
+            "type": "int",
+            "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
+        },
+        # output shaping
+        "minimize_json_output": {
+            "default": True,
+            "type": "bool",
+            "help": "if true, will remove empty values from the json output",
+        },
+    },
+    "description": """
+Archives various types of Instagram content using the Instagrapi API.
+
+Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
+
+### Features
+- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
+- Supports advanced configuration options, including:
+  - Full profile download (all posts, stories, highlights, and tagged content).
+  - Limiting the number of posts to fetch for large profiles.
+  - Minimising JSON output to remove empty fields and redundant data.
+- Provides robust error handling and retries for API calls.
+- Ensures efficient media scraping, including handling nested or carousel media items.
+- Adds downloaded media and metadata to the result for further processing.
+
+### Notes
+- Requires a valid Instagrapi API token (`access_token`) and API endpoint (`api_endpoint`).
+- Full-profile downloads can be limited by setting `full_profile_max_posts`.
+- Designed to fetch content in batches for large profiles, minimising API load.
+""",
+}
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -1,5 +1,5 @@
 """
-The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
+The `instagram_api_extractor` module provides tools for archiving various types of Instagram content
 using the [Instagrapi API](https://github.com/subzeroid/instagrapi).

 Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
@@ -9,87 +9,88 @@ data, reducing JSON output size, and handling large profiles.
 """

 import re
-import requests
 from datetime import datetime
+
+import requests
 from loguru import logger
 from retrying import retry
 from tqdm import tqdm

-from . import Archiver
-from ..core import Metadata
-from ..core import Media
+from auto_archiver.core import Extractor
+from auto_archiver.core import Media
+from auto_archiver.core import Metadata

-class InstagramAPIArchiver(Archiver):
+
+class InstagramAPIExtractor(Extractor):
    """
    Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
-    
+
    # TODO: improvement collect aggregates of locations[0].location and mentions for all posts
    """
-    name = "instagram_api_archiver"

-    global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?")
+    valid_url = re.compile(
+        r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
+    )

-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.assert_valid_string("access_token")
-        self.assert_valid_string("api_endpoint")
-        self.full_profile_max_posts = int(self.full_profile_max_posts)
-        if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
+    def setup(self) -> None:
+        """Normalise `api_endpoint` by dropping a single trailing slash.
+
+        `call_api` joins the endpoint and the request path with "/", so a
+        trailing slash would otherwise produce a double slash in the URL.
+        """
+        # endswith() is safe on an empty string, where indexing [-1] raises IndexError
+        if self.api_endpoint.endswith("/"):
+            self.api_endpoint = self.api_endpoint[:-1]

-        self.full_profile = bool(self.full_profile)
-        self.minimize_json_output = bool(self.minimize_json_output)

-    @staticmethod
-    def configs() -> dict:
-        return {
-            "access_token": {"default": None, "help": "a valid instagrapi-api token"},
-            "api_endpoint": {"default": None, "help": "API endpoint to use"},
-            "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
-            "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
-            "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
-        }
-    
    def download(self, item: Metadata) -> Metadata:
+        """Archive the Instagram content that `item`'s URL points to.
+
+        The URL is matched against `valid_url` and dispatched on the first
+        regex group: a profile, a post (`p`), a reel, a story, or story
+        highlights. Returns None when the URL does not match exactly once or
+        the group is not recognised; otherwise returns whatever the selected
+        `download_*` method returns.
+        """
        url = item.get_url()

-        url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
-        insta_matches = self.global_pattern.findall(url)
+        # str.replace returns a new string, so the result must be re-assigned
+        # for the short-domain rewrite to reach the regex below
+        url = url.replace("instagr.com", "instagram.com").replace(
+            "instagr.am", "instagram.com"
+        )
+        insta_matches = self.valid_url.findall(url)
        logger.info(f"{insta_matches=}")
-        if not len(insta_matches) or len(insta_matches[0])!=3: return
-        if len(insta_matches) > 1: 
-            logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
+        if not len(insta_matches) or len(insta_matches[0]) != 3:
+            return
+        if len(insta_matches) > 1:
+            logger.warning(
+                f"Multiple instagram matches found in {url=}, using the first one"
+            )
            return
        g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
-        if g1 == "": return self.download_profile(item, g2)
-        elif g1 == "p": return self.download_post(item, g2, context="post")
-        elif g1 == "reel": return self.download_post(item, g2, context="reel")
-        elif g1 == "stories/highlights": return self.download_highlights(item, g2)
-        elif g1 == "stories": 
-            if len(g3): return self.download_post(item, id=g3, context="story")
+        if g1 == "":
+            return self.download_profile(item, g2)
+        elif g1 == "p":
+            return self.download_post(item, g2, context="post")
+        elif g1 == "reel":
+            return self.download_post(item, g2, context="reel")
+        elif g1 == "stories/highlights":
+            return self.download_highlights(item, g2)
+        elif g1 == "stories":
+            if len(g3):
+                return self.download_post(item, id=g3, context="story")
            return self.download_stories(item, g2)
-        else: 
+        else:
            logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
            return
-        
+
    @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
    def call_api(self, path: str, params: dict) -> dict:
-        headers = {
-            "accept": "application/json",
-            "x-access-key": self.access_token
-        }
+        headers = {"accept": "application/json", "x-access-key": self.access_token}
        logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
-        return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
+        return requests.get(
+            f"{self.api_endpoint}/{path}", headers=headers, params=params
+        ).json()

    def cleanup_dict(self, d: dict | list) -> dict:
        # repeats 3 times to remove nested empty values
-        if not self.minimize_json_output: return d
-        if type(d) == list: return [self.cleanup_dict(v) for v in d]
-        if type(d) != dict: return d
+        if not self.minimize_json_output:
+            return d
+        if type(d) == list:
+            return [self.cleanup_dict(v) for v in d]
+        if type(d) != dict:
+            return d
        return {
-                k: clean_v
-                for k, v in d.items() 
-                if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and
-                k not in ["x", "y", "width", "height"]
+            k: clean_v
+            for k, v in d.items()
+            if (clean_v := self.cleanup_dict(v))
+            not in [0.0, 0, [], {}, "", None, "null"]
+            and k not in ["x", "y", "width", "height"]
        }

    def download_profile(self, result: Metadata, username: str) -> Metadata:
@@ -125,7 +126,9 @@ class InstagramAPIArchiver(Archiver):
            try:
                self.download_all_tagged(result, user_id)
            except Exception as e:
-                result.append("errors", f"Error downloading tagged posts for {username}")
+                result.append(
+                    "errors", f"Error downloading tagged posts for {username}"
+                )
                logger.error(f"Error downloading tagged posts for {username}: {e}")

            # download all highlights
@@ -135,26 +138,37 @@ class InstagramAPIArchiver(Archiver):
                result.append("errors", f"Error downloading highlights for {username}")
                logger.error(f"Error downloading highlights for {username}: {e}")

-
-        result.set_url(url) # reset as scrape_item modifies it
+        result.set_url(url)  # reset as scrape_item modifies it
        return result.success("insta profile")

    def download_all_highlights(self, result, username, user_id):
        count_highlights = 0
        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
        for h in highlights:
-            try: 
+            try:
                h_info = self._download_highlights_reusable(result, h.get("pk"))
                count_highlights += len(h_info.get("items", []))
            except Exception as e:
-                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
-                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                result.append(
+                    "errors",
+                    f"Error downloading highlight id{h.get('pk')} for {username}",
+                )
+                logger.error(
+                    f"Error downloading highlight id{h.get('pk')} for {username}: {e}"
+                )
+            if (
+                self.full_profile_max_posts
+                and count_highlights >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                break
        result.set("#highlights", count_highlights)

-    def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
+    def download_post(
+        self, result: Metadata, code: str = None, id: str = None, context: str = None
+    ) -> Metadata:
        if id:
            post = self.call_api(f"v1/media/by/id", {"id": id})
        else:
@@ -166,7 +180,8 @@ class InstagramAPIArchiver(Archiver):

        post = self.scrape_item(result, post, context)

-        if post.get("taken_at"): result.set_timestamp(post.get("taken_at"))
+        if post.get("taken_at"):
+            result.set_timestamp(post.get("taken_at"))
        return result.success(f"insta {context or 'post'}")

    def download_highlights(self, result: Metadata, id: str) -> Metadata:
@@ -175,96 +190,127 @@ class InstagramAPIArchiver(Archiver):
        del h_info["items"]
        result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items)
        return result.success("insta highlights")
-    
-    def _download_highlights_reusable(self, result: Metadata, id: str) ->dict:
+
+    def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
        full_h = self.call_api(f"v2/highlight/by/id", {"id": id})
        h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
        assert h_info, f"Highlight {id} not found: {full_h=}"

-        if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
+        if (
+            cover_media := h_info.get("cover_media", {})
+            .get("cropped_image_version", {})
+            .get("url")
+        ):
            filename = self.download_from_url(cover_media)
            result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")

-        items = h_info.get("items", [])[::-1] # newest to oldest
+        items = h_info.get("items", [])[::-1]  # newest to oldest
        for h in tqdm(items, desc="downloading highlights", unit="highlight"):
-            try: self.scrape_item(result, h, "highlight")
+            try:
+                self.scrape_item(result, h, "highlight")
            except Exception as e:
                result.append("errors", f"Error downloading highlight {h.get('id')}")
-                logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")
-        
+                logger.error(
+                    f"Error downloading highlight, skipping {h.get('id')}: {e}"
+                )
+
        return h_info
-  
+
    def download_stories(self, result: Metadata, username: str) -> Metadata:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M")
        stories = self._download_stories_reusable(result, username)
-        if stories == []: return result.success("insta no story")
+        if stories == []:
+            return result.success("insta no story")
        result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
        return result.success(f"insta stories {now}")
-    
+
    def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
        stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
-        if not stories or not len(stories): return []
-        stories = stories[::-1] # newest to oldest
+        if not stories or not len(stories):
+            return []
+        stories = stories[::-1]  # newest to oldest

        for s in tqdm(stories, desc="downloading stories", unit="story"):
-            try: self.scrape_item(result, s, "story")
+            try:
+                self.scrape_item(result, s, "story")
            except Exception as e:
                result.append("errors", f"Error downloading story {s.get('id')}")
                logger.error(f"Error downloading story, skipping {s.get('id')}: {e}")
        return stories
-        
+
    def download_all_posts(self, result: Metadata, user_id: str):
        end_cursor = None
        pbar = tqdm(desc="downloading posts")

        post_count = 0
        while end_cursor != "":
-            posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
-            if not len(posts) or not type(posts) == list or len(posts) != 2: break
+            posts = self.call_api(
+                f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}
+            )
+            if not len(posts) or not type(posts) == list or len(posts) != 2:
+                break
            posts, end_cursor = posts[0], posts[1]
            logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")

            for p in posts:
-                try: self.scrape_item(result, p, "post")
+                try:
+                    self.scrape_item(result, p, "post")
                except Exception as e:
                    result.append("errors", f"Error downloading post {p.get('id')}")
                    logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                pbar.update(1)
-                post_count+=1
-            if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
-                logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                post_count += 1
+            if (
+                self.full_profile_max_posts
+                and post_count >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                break
        result.set("#posts", post_count)
-        
+
    def download_all_tagged(self, result: Metadata, user_id: str):
        next_page_id = ""
        pbar = tqdm(desc="downloading tagged posts")

        tagged_count = 0
        while next_page_id != None:
-            resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
+            resp = self.call_api(
+                f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}
+            )
            posts = resp.get("response", {}).get("items", [])
-            if not len(posts): break
+            if not len(posts):
+                break
            next_page_id = resp.get("next_page_id")
-            
+
            logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")

            for p in posts:
-                try: self.scrape_item(result, p, "tagged")
+                try:
+                    self.scrape_item(result, p, "tagged")
                except Exception as e:
-                    result.append("errors", f"Error downloading tagged post {p.get('id')}")
-                    logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
+                    result.append(
+                        "errors", f"Error downloading tagged post {p.get('id')}"
+                    )
+                    logger.error(
+                        f"Error downloading tagged post, skipping {p.get('id')}: {e}"
+                    )
                pbar.update(1)
-                tagged_count+=1
-            if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
-                logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                tagged_count += 1
+            if (
+                self.full_profile_max_posts
+                and tagged_count >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                break
        result.set("#tagged", tagged_count)

+    ### reusable parsing utils below

-### reusable parsing utils below
-
-    def scrape_item(self, result:Metadata, item:dict, context:str=None) -> dict:
+    def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict:
        """
        receives a Metadata and an API dict response
        fetches the media and adds it to the Metadata
@@ -272,23 +318,25 @@ class InstagramAPIArchiver(Archiver):
        context can be used to give specific id prefixes to media
        """
        if "clips_metadata" in item:
-            if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
+            if reusable_text := item.get("clips_metadata", {}).get(
+                "reusable_text_attribute_string"
+            ):
                item["clips_metadata_text"] = reusable_text
-            if self.minimize_json_output: 
+            if self.minimize_json_output:
                del item["clips_metadata"]

-        if code := item.get("code") and not result.get("url"): 
+        if code := item.get("code") and not result.get("url"):
            result.set_url(f"https://www.instagram.com/p/{code}/")
-            
+
        resources = item.get("resources", item.get("carousel_media", []))
        item, media, media_id = self.scrape_media(item, context)
        # if resources are present take the main media from the first resource
        if not media and len(resources):
            _, media, media_id = self.scrape_media(resources[0], context)
            resources = resources[1:]
-        
+
        assert media, f"Image/video not found in {item=}"
-            
+
        # posts with multiple items contain a resources list
        resources_metadata = Metadata()
        for r in resources:
@@ -298,40 +346,54 @@ class InstagramAPIArchiver(Archiver):

        result.add_media(media, id=media_id)
        return item
-    
-    def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]:
+
+    def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]:
        # remove unnecessary info
-        if self.minimize_json_output: 
-            for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]:
-                if k in item: del item[k]
+        if self.minimize_json_output:
+            for k in [
+                "image_versions",
+                "video_versions",
+                "video_dash_manifest",
+                "image_versions2",
+                "video_versions2",
+            ]:
+                if k in item:
+                    del item[k]
        item = self.cleanup_dict(item)

        image_media = None
        if image_url := item.get("thumbnail_url"):
            filename = self.download_from_url(image_url, verbose=False)
            image_media = Media(filename=filename)
-            
+
        # retrieve video info
-        best_id = item.get('id', item.get('pk'))
+        best_id = item.get("id", item.get("pk"))
        taken_at = item.get("taken_at", item.get("taken_at_ts"))
        code = item.get("code")
        caption_text = item.get("caption_text")
-        if "carousel_media" in item: del item["carousel_media"]
+        if "carousel_media" in item:
+            del item["carousel_media"]

        if video_url := item.get("video_url"):
            filename = self.download_from_url(video_url, verbose=False)
            video_media = Media(filename=filename)
-            if taken_at: video_media.set("date", taken_at)
-            if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
-            if caption_text: video_media.set("text", caption_text)
+            if taken_at:
+                video_media.set("date", taken_at)
+            if code:
+                video_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text:
+                video_media.set("text", caption_text)
            video_media.set("preview", [image_media])
            video_media.set("data", [item])
            return item, video_media, f"{context or 'video'} {best_id}"
        elif image_media:
-            if taken_at: image_media.set("date", taken_at)
-            if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
-            if caption_text: image_media.set("text", caption_text)
+            if taken_at:
+                image_media.set("date", taken_at)
+            if code:
+                image_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text:
+                image_media.set("text", caption_text)
            image_media.set("data", [item])
            return item, image_media, f"{context or 'image'} {best_id}"
-        
-        return item, None, None
+
+        return item, None, None
--- a/src/auto_archiver/modules/instagram_extractor/init.py
+++ b/src/auto_archiver/modules/instagram_extractor/init.py
@@ -0,0 +1 @@
+from .instagram_extractor import InstagramExtractor
--- a/src/auto_archiver/modules/instagram_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_extractor/manifest.py
@@ -0,0 +1,36 @@
+{
+    "name": "Instagram Extractor",
+    "type": ["extractor"],
+    "dependencies": {
+        "python": [
+            "instaloader",
+            "loguru",
+        ],
+    },
+    "requires_setup": True,
+    "configs": {
+        "username": {"required": True,
+                     "help": "a valid Instagram username"},
+        "password": {
+            "required": True,
+            "help": "the corresponding Instagram account password",
+        },
+        "download_folder": {
+            "default": "instaloader",
+            "help": "name of a folder to temporarily download content to",
+        },
+        "session_file": {
+            "default": "secrets/instaloader.session",
+            "help": "path to the instagram session which saves session credentials",
+        },
+        # TODO: fine-grain
+        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
+    },
+    "description": """
+    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
+    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    highlights, and tagged posts. 
+    Authentication is required via username/password or a session file.
+                    
+                    """,
+}
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -4,33 +4,29 @@

 """
 import re, os, shutil, traceback
-import instaloader  # https://instaloader.github.io/as-module.html
+import instaloader
 from loguru import logger

-from . import Archiver
-from ..core import Metadata
-from ..core import Media
+from auto_archiver.core import Extractor
+from auto_archiver.core import Metadata
+from auto_archiver.core import Media

-class InstagramArchiver(Archiver):
+class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
-    name = "instagram_archiver"
-
    # NB: post regex should be tested before profile
+
+    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
+
    # https://regex101.com/r/MGPquX/1
-    post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
+    # valid_url is a compiled pattern: interpolate its `.pattern` source text, formatting the object itself would insert "re.compile('...')"
+    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url.pattern))
    # https://regex101.com/r/6Wbsxa/1
-    profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
+    profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url.pattern))
    # TODO: links to stories

-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        # TODO: refactor how configuration validation is done
-        self.assert_valid_string("username")
-        self.assert_valid_string("password")
-        self.assert_valid_string("download_folder")
-        self.assert_valid_string("session_file")
+    def setup(self) -> None:
+
        self.insta = instaloader.Instaloader(
            download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
@@ -45,16 +41,7 @@ class InstagramArchiver(Archiver):
            except Exception as e2:
                logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")

-    @staticmethod
-    def configs() -> dict:
-        return {
-            "username": {"default": None, "help": "a valid Instagram username"},
-            "password": {"default": None, "help": "the corresponding Instagram account password"},
-            "download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"},
-            "session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"},
-            #TODO: fine-grain
-            # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
-        }
+

    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
@@ -76,7 +63,7 @@ class InstagramArchiver(Archiver):
            elif len(profile_matches):
                result = self.download_profile(url, profile_matches[0])
        except Exception as e:
-            logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.")
+            logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
        finally:
            shutil.rmtree(self.download_folder, ignore_errors=True)
        return result
--- a/src/auto_archiver/modules/instagram_tbot_extractor/init.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/init.py
@@ -0,0 +1 @@
+from .instagram_tbot_extractor import InstagramTbotExtractor
--- a/src/auto_archiver/modules/instagram_tbot_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/manifest.py
@@ -0,0 +1,40 @@
+{
+    "name": "Instagram Telegram Bot Extractor",
+    "type": ["extractor"],
+    "dependencies": {"python": ["loguru", "telethon",],
+                              },
+    "requires_setup": True,
+    "configs": {
+            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
+            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
+            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
+            "timeout": {"default": 45,
+                        "type": "int",
+                        "help": "timeout to fetch the instagram content in seconds."},
+    },
+    "description": """
+The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
+such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
+to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
+returned as part of a `Metadata` object.
+
+### Features
+- Supports archiving Instagram posts and stories through the Telegram bot.
+- Downloads and saves media files (e.g., images, videos) in a temporary directory.
+- Captures and returns metadata, including titles and descriptions, as a `Metadata` object.
+- Automatically manages Telegram session files for secure access.
+
+### Setup
+
+To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
+- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
+- **Session File**: Optional path to store the Telegram session file for future use.
+- The session file is created automatically and should be unique for each instance.
+- You may need to enter your Telegram credentials (phone) and a 2FA code sent to you the first time you run the extractor:
+```2025-01-30 00:43:49.348 | INFO     | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login...
+Please enter your phone (or bot token): +447123456789
+Please enter the code you received: 00000
+Signed in successfully as E C; remember to not break the ToS or you will risk an account ban!
+```
+    """,
+}
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -0,0 +1,121 @@
+"""
+InstagramTbotExtractor Module
+
+This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
+It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
+relevant media and metadata. The fetched content is saved as `Media` objects in a temporary directory and returned as a
+`Metadata` object.
+"""
+
+import os
+import shutil
+import time
+from sqlite3 import OperationalError
+
+from loguru import logger
+from telethon.sync import TelegramClient
+
+from auto_archiver.core import Extractor
+from auto_archiver.core import Metadata, Media
+from auto_archiver.utils import random_str
+
+
+class InstagramTbotExtractor(Extractor):
+    """
+    calls a telegram bot to fetch instagram posts/stories... and gets available media from it
+    https://github.com/adw0rd/instagrapi
+    https://t.me/instagram_load_bot
+    """
+
+    def setup(self) -> None:
+        """
+        Prepares the Telegram session used by this extractor:
+        1. makes a copy of session_file that is removed in cleanup
+        2. creates the Telegram client and starts it once to check that login works
+        """
+        logger.info(f"SETUP {self.name} checking login...")
+        self._prepare_session_file()
+        self._initialize_telegram_client()
+
+    def _prepare_session_file(self):
+        """
+        Creates a copy of the session file for exclusive use with this extractor instance.
+
+        The copy is written to `secrets/` under a name built from today's date and `random_str(8)`,
+        and `self.session_file` is re-pointed at that copy (without the `.session` suffix).
+
+        Raises:
+            FileNotFoundError: if `<session_file>.session` does not exist.
+        """
+        new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
+        if not os.path.exists(f"{self.session_file}.session"):
+            raise FileNotFoundError(f"Session file {self.session_file}.session not found.")
+        shutil.copy(self.session_file + ".session", new_session_file)
+        # store the path suffix-free: the rest of the class appends ".session" itself
+        self.session_file = new_session_file.replace(".session", "")
+
+    def _initialize_telegram_client(self):
+        """
+        Creates the Telegram client from `self.session_file`, `self.api_id` and `self.api_hash`,
+        then starts it once to confirm that login works.
+
+        An `OperationalError` raised while creating the client is logged (including the error text).
+        """
+        try:
+            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
+        except OperationalError as e:
+            # every segment that interpolates a value needs its own f-prefix, otherwise "{e}" is logged literally
+            logger.error(
+                f"Unable to access the {self.session_file} session. "
+                "Ensure that you don't use the same session file here and in telethon_extractor. "
+                f"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
+            )
+        # NOTE(review): if the constructor failed above, self.client may be unset here - confirm whether this should re-raise
+        with self.client.start():
+            logger.success(f"SETUP {self.name} login works.")
+
+    def cleanup(self) -> None:
+        """Deletes the `<session_file>.session` file if it exists (after setup this is the per-instance copy)."""
+        logger.info(f"CLEANUP {self.name}.")
+        session_copy = self.session_file + ".session"
+        if not os.path.exists(session_copy):
+            return
+        os.remove(session_copy)
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        Forwards the item's URL to the Telegram bot and stores the bot's reply in a new `Metadata`.
+
+        Returns False when the URL does not contain "instagram.com" or when the bot answers that
+        a post URL is required; otherwise returns `result.success("insta-via-bot")`.
+        """
+        url = item.get_url()
+        if "instagram.com" not in url:
+            return False
+
+        result = Metadata()
+        tmp_dir = self.tmp_dir
+        with self.client.start():
+            chat, since_id = self._send_url_to_bot(url)
+            message = self._process_messages(chat, since_id, tmp_dir, result)
+
+            if "You must enter a URL to a post" in message:
+                logger.debug(f"invalid link {url=} for {self.name}: {message}")
+                return False
+            # # TODO: It currently returns this as a success - is that intentional?
+            # if "Media not found or unavailable" in message:
+            #     logger.debug(f"invalid link {url=} for {self.name}: {message}")
+            #     return False
+
+            if message:
+                result.set_content(message).set_title(message[:128])
+            return result.success("insta-via-bot")
+
+    def _send_url_to_bot(self, url: str):
+        """
+        Posts `url` to the 'instagram_load_bot' chat.
+
+        Returns a `(chat, since_id)` tuple: the resolved bot entity and the id of the message just sent.
+        """
+        bot_chat = self.client.get_entity("instagram_load_bot")
+        sent = self.client.send_message(entity=bot_chat, message=url)
+        return bot_chat, sent.id
+
+    def _process_messages(self, chat, since_id, tmp_dir, result):
+        """
+        Polls the bot chat for replies newer than `since_id`, downloading any media into `tmp_dir`.
+
+        Waits 3 seconds, then checks once per second for at most `self.timeout - 3` attempts, stopping
+        early once both some text and at least one media file have been received. Downloaded files are
+        added to `result` as `Media`; the concatenated reply text is returned with surrounding whitespace stripped.
+        """
+        attempts = 0
+        seen_media = []
+        message = ""
+        time.sleep(3)
+        # media is added before text by the bot so it can be used as a stop-logic mechanism
+        while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
+            attempts += 1
+            time.sleep(1)
+            for post in self.client.iter_messages(chat, min_id=since_id):
+                # remember the newest id seen; it is passed as min_id on the next poll
+                since_id = max(since_id, post.id)
+                # Skip known filler message:
+                if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
+                    continue
+                if post.media and post.id not in seen_media:
+                    # files are named "<chat id>_<message id>" inside tmp_dir
+                    filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
+                    media = self.client.download_media(post.media, filename_dest)
+                    # only record the post when download_media returned a truthy value
+                    if media:
+                        result.add_media(Media(media))
+                        seen_media.append(post.id)
+                if post.message: message += post.message
+        return message.strip()
--- a/src/auto_archiver/modules/local_storage/init.py
+++ b/src/auto_archiver/modules/local_storage/init.py
@@ -0,0 +1 @@
+from .local_storage import LocalStorage
--- a/src/auto_archiver/modules/local_storage/manifest.py
+++ b/src/auto_archiver/modules/local_storage/manifest.py
@@ -0,0 +1,35 @@
+{
+    "name": "Local Storage",
+    "type": ["storage"],
+    "requires_setup": False,
+    "dependencies": {
+        "python": ["loguru"],
+    },
+    "configs": {
+        "path_generator": {
+            "default": "flat",
+            "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+            "choices": ["flat", "url", "random"],
+        },
+        "filename_generator": {
+            "default": "static",
+            "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+            "choices": ["random", "static"],
+        },
+        "save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
+        "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+    },
+    "description": """
+    LocalStorage: A storage module for saving archived content locally on the filesystem.
+
+    ### Features
+    - Saves archived media files to a specified folder on the local filesystem.
+    - Maintains file metadata during storage using `shutil.copy2`.
+    - Supports both absolute and relative paths for stored files, configurable via `save_absolute`.
+    - Automatically creates directories as needed for storing files.
+
+    ### Notes
+    - Default storage folder is `./local_archive`, but this can be changed via the `save_to` configuration.
+    - The `save_absolute` option can reveal the file structure in output formats; use with caution.
+    """
+}
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`from .generic_archiver import GenericArchiver`
				`@@ -0,0 +1 @@`
				`from .generic_extractor import GenericExtractor`
				`@@ -0,0 +1 @@`
				`from .instagram_api_extractor import InstagramAPIExtractor`
				`@@ -0,0 +1 @@`
				`from .instagram_extractor import InstagramExtractor`
				`@@ -0,0 +1 @@`
				`from .instagram_tbot_extractor import InstagramTbotExtractor`