mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Merge branch 'load_modules' into add_module_tests
# Conflicts: # src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db:AAApiDb",
|
||||
"entry_point": "api_db::AAApiDb",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["requests", "loguru"],
|
||||
@@ -23,7 +23,7 @@
|
||||
"default": None,
|
||||
"help": "which group of users have access to the archive in case public=false as author",
|
||||
},
|
||||
"allow_rearchive": {
|
||||
"use_api_cache": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
|
||||
@@ -43,7 +43,7 @@
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
|
||||
- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
|
||||
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
|
||||
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
||||
@@ -15,11 +15,11 @@ class AAApiDb(Database):
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""
|
||||
if not self.allow_rearchive: return
|
||||
|
||||
if not self.use_api_cache: return
|
||||
|
||||
params = {"url": item.get_url(), "limit": 15}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
||||
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
|
||||
response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
if len(response.json()):
|
||||
@@ -30,21 +30,26 @@ class AAApiDb(Database):
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
return False
|
||||
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
if not self.store_results: return
|
||||
if cached:
|
||||
if cached:
|
||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||
return
|
||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||
payload = {
|
||||
'author_id': self.author_id,
|
||||
'url': item.get_url(),
|
||||
'public': self.public,
|
||||
'group_id': self.group_id,
|
||||
'tags': list(self.tags),
|
||||
'result': item.to_json(),
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
|
||||
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
if response.status_code == 201:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Atlos Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "atlos_db:AtlosDb",
|
||||
"entry_point": "atlos_db::AtlosDb",
|
||||
"requires_setup": True,
|
||||
"dependencies":
|
||||
{"python": ["loguru",
|
||||
|
||||
@@ -109,6 +109,6 @@ class GsheetsDb(Database):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
print(self.sheet_id)
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
|
||||
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
|
||||
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
|
||||
is advised for more comprehensive functionality.
|
||||
is advised for more comprehensive functionality, and higher quality media extraction.
|
||||
|
||||
### Features
|
||||
- Extracts images and videos from public Telegram message links (`t.me`).
|
||||
|
||||
Reference in New Issue
Block a user