Merge branch 'main' into small_issues

# Conflicts:
#	src/auto_archiver/core/base_module.py
#	src/auto_archiver/utils/misc.py
This commit is contained in:
erinhmclark
2025-02-26 13:19:49 +00:00
48 changed files with 890 additions and 299 deletions

View File

@@ -1,5 +1,5 @@
{
"name": "Auto-Archiver API Database",
"name": "Auto Archiver API Database",
"type": ["database"],
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
@@ -39,7 +39,7 @@
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
Provides integration with the Auto Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
@@ -49,6 +49,6 @@
- **Optional Storage**: Archives results conditionally based on configuration.
### Setup
Requires access to an Auto-Archiver API instance and a valid API token.
Requires access to an Auto Archiver API instance and a valid API token.
""",
}

View File

@@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
@@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
ydl_options['cookiefile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

View File

@@ -15,7 +15,8 @@
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": True,
},
"columns": {
"default": {
@@ -34,16 +35,16 @@
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader",
},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
},
"use_sheet_names_in_stored_paths": {
"default": True,
@@ -64,8 +65,10 @@
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
- Supports organizing stored files into folder paths based on sheet and worksheet names.
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
### Setup
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
- Customize the column names in your Google sheet using the `columns` configuration.
""",
}

View File

@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
)
if not self.sheet and not self.sheet_id:
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
def open_sheet(self):
if self.sheet:

View File

@@ -18,7 +18,7 @@
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
"type": "auto_archiver.utils.json_loader",
"type": "json_loader",
}
},
"description": """

View File

@@ -1 +0,0 @@
from .wacz_enricher import WaczExtractorEnricher

View File

@@ -0,0 +1 @@
from .wacz_extractor_enricher import WaczExtractorEnricher

View File

@@ -1,7 +1,7 @@
{
"name": "WACZ Enricher",
"name": "WACZ Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": [

View File

@@ -1,5 +1,5 @@
{
"name": "Wayback Machine Enricher",
"name": "Wayback Machine Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,