mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Merge branch 'main' into timestamping_rewrite
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"name": "Auto Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db::AAApiDb",
|
||||
"requires_setup": True,
|
||||
@@ -39,7 +39,7 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
Provides integration with the Auto Archiver API for querying and storing archival data.
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
@@ -49,6 +49,6 @@
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
||||
### Setup
|
||||
Requires access to an Auto-Archiver API instance and a valid API token.
|
||||
Requires access to an Auto Archiver API instance and a valid API token.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
@@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
|
||||
ydl_options['cookiesfile'] = auth['cookies_file']
|
||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
||||
ydl_options['cookiefile'] = auth['cookies_file']
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": True,
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
@@ -34,16 +35,16 @@
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
@@ -64,8 +65,10 @@
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
|
||||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
assert self.sheet or self.sheet_id, (
|
||||
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
)
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"type": "json_loader",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .wacz_enricher import WaczExtractorEnricher
|
||||
@@ -0,0 +1 @@
|
||||
from .wacz_extractor_enricher import WaczExtractorEnricher
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"name": "WACZ Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"name": "Wayback Machine Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
|
||||
Reference in New Issue
Block a user