Merge branch 'main' into timestamping_rewrite

2026-06-12 13:18:28 +03:00 · 2025-02-25 17:10:55 +00:00
parent 898faf6fe4 1ad158c016
commit 4dcb77c29f
48 changed files with 889 additions and 299 deletions
--- a/src/auto_archiver/modules/api_db/manifest.py
+++ b/src/auto_archiver/modules/api_db/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "Auto-Archiver API Database",
+    "name": "Auto Archiver API Database",
    "type": ["database"],
    "entry_point": "api_db::AAApiDb",
    "requires_setup": True,
@@ -39,7 +39,7 @@
        },
    },
    "description": """
-     Provides integration with the Auto-Archiver API for querying and storing archival data.
+     Provides integration with the Auto Archiver API for querying and storing archival data.

 ### Features
 - **API Integration**: Supports querying for existing archives and submitting results.
@@ -49,6 +49,6 @@
 - **Optional Storage**: Archives results conditionally based on configuration.

 ### Setup
-Requires access to an Auto-Archiver API instance and a valid API token.
+Requires access to an Auto Archiver API instance and a valid API token.
     """,
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
        
        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)
+
        # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
        if auth:
            if 'username' in auth and 'password' in auth:
@@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
                logger.debug(f'Using provided auth cookie for {url}')
                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
            elif 'cookies_from_browser' in auth:
-                logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
+                logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
            elif 'cookies_file' in auth:
-                logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
-                ydl_options['cookiesfile'] = auth['cookies_file']
+                logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
+                ydl_options['cookiefile'] = auth['cookies_file']

        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@@ -15,7 +15,8 @@
        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
            "default": "secrets/service_account.json",
-            "help": "service account JSON file path",
+            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
+            "required": True,
        },
        "columns": {
            "default": {
@@ -34,16 +35,16 @@
                "wacz": "wacz",
                "replaywebpage": "replaywebpage",
            },
-            "help": "names of columns in the google sheet (stringified JSON object)",
-            "type": "auto_archiver.utils.json_loader",
+            "help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
+            "type": "json_loader",
        },
        "allow_worksheets": {
            "default": set(),
-            "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+            "help": "A list of worksheet names that should be processed (overrides block_worksheets); leave empty to allow all worksheets",
        },
        "block_worksheets": {
            "default": set(),
-            "help": "(CSV) explicitly block some worksheets from being processed",
+            "help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
        },
        "use_sheet_names_in_stored_paths": {
            "default": True,
@@ -64,8 +65,10 @@
    - Ensures only rows with valid URLs and unprocessed statuses are included for archival.
    - Supports organizing stored files into folder paths based on sheet and worksheet names.

-    ### Notes
-    - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
-    - Create the sheet using the template provided in the docs.
+    ### Setup
+    - Requires a Google Service Account JSON file for authentication; by default it is read from `secrets/service_account.json` (change this with the `service_account` setting).
+    To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
+    - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
+    - Customize the column names in your Google sheet using the `columns` configuration.
    """,
 }
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
    def setup(self) -> None:
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
-        assert self.sheet or self.sheet_id, (
-            "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
-        )
+        if not self.sheet and not self.sheet_id:
+            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")

    def open_sheet(self):
        if self.sheet:
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -18,7 +18,7 @@
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
-                "type": "auto_archiver.utils.json_loader",
+                "type": "json_loader",
            }
        },
    "description": """
--- a/src/auto_archiver/modules/wacz_enricher/init.py
+++ b/src/auto_archiver/modules/wacz_enricher/init.py
@@ -1 +0,0 @@
-from .wacz_enricher import WaczExtractorEnricher
--- a/src/auto_archiver/modules/wacz_extractor_enricher/init.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/init.py
@@ -0,0 +1 @@
+from .wacz_extractor_enricher import WaczExtractorEnricher
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -1,7 +1,7 @@
 {
-    "name": "WACZ Enricher",
+    "name": "WACZ Enricher (and Extractor)",
    "type": ["enricher", "extractor"],
-    "entry_point": "wacz_enricher::WaczExtractorEnricher",
+    "entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
        "python": [
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "Wayback Machine Enricher",
+    "name": "Wayback Machine Enricher (and Extractor)",
    "type": ["enricher", "extractor"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
				`@@ -1 +0,0 @@`
				`from .wacz_enricher import WaczExtractorEnricher`
				`@@ -0,0 +1 @@`
				`from .wacz_extractor_enricher import WaczExtractorEnricher`