mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Merge branch 'main' into opentimestamps
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"name": "Auto Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db::AAApiDb",
|
||||
"requires_setup": True,
|
||||
@@ -24,9 +24,9 @@
|
||||
"help": "which group of users have access to the archive in case public=false as author",
|
||||
},
|
||||
"use_api_cache": {
|
||||
"default": True,
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
|
||||
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
|
||||
},
|
||||
"store_results": {
|
||||
"default": True,
|
||||
@@ -39,7 +39,7 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
Provides integration with the Auto Archiver API for querying and storing archival data.
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
@@ -49,6 +49,6 @@
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
||||
### Setup
|
||||
Requires access to an Auto-Archiver API instance and a valid API token.
|
||||
Requires access to an Auto Archiver API instance and a valid API token.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from atlos_db import AtlosDb
|
||||
@@ -1,36 +0,0 @@
|
||||
{
|
||||
"name": "Atlos Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "atlos_db::AtlosDb",
|
||||
"requires_setup": True,
|
||||
"dependencies":
|
||||
{"python": ["loguru",
|
||||
""],
|
||||
"bin": [""]},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Handles integration with the Atlos platform for managing archival results.
|
||||
|
||||
### Features
|
||||
- Outputs archival results to the Atlos API for storage and tracking.
|
||||
- Updates failure status with error details when archiving fails.
|
||||
- Processes and formats metadata, including ISO formatting for datetime fields.
|
||||
- Skips processing for items without an Atlos ID.
|
||||
|
||||
### Setup
|
||||
Required configs:
|
||||
- atlos_url: Base URL for the Atlos API.
|
||||
- api_token: Authentication token for API access.
|
||||
"""
|
||||
,
|
||||
}
|
||||
@@ -1,66 +0,0 @@
|
||||
from typing import Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AtlosDb(Database):
|
||||
"""
|
||||
Outputs results to Atlos
|
||||
"""
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
"""Update DB accordingly for failure"""
|
||||
# If the item has no Atlos ID, there's nothing for us to do
|
||||
if not item.metadata.get("atlos_id"):
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
return
|
||||
|
||||
requests.post(
|
||||
f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
||||
).raise_for_status()
|
||||
logger.info(
|
||||
f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}"
|
||||
)
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check and fetch if the given item has been archived already, each
|
||||
database should handle its own caching, and configuration mechanisms"""
|
||||
return False
|
||||
|
||||
def _process_metadata(self, item: Metadata) -> dict:
|
||||
"""Process metadata for storage on Atlos. Will convert any datetime
|
||||
objects to ISO format."""
|
||||
|
||||
return {
|
||||
k: v.isoformat() if hasattr(v, "isoformat") else v
|
||||
for k, v in item.metadata.items()
|
||||
}
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
|
||||
if not item.metadata.get("atlos_id"):
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
return
|
||||
|
||||
requests.post(
|
||||
f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
json={
|
||||
"metadata": dict(
|
||||
processed=True,
|
||||
status="success",
|
||||
results=self._process_metadata(item),
|
||||
)
|
||||
},
|
||||
).raise_for_status()
|
||||
|
||||
logger.info(
|
||||
f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
|
||||
)
|
||||
@@ -1,13 +0,0 @@
|
||||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"type": str
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": str
|
||||
},
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
from .atlos_feeder import AtlosFeeder
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"name": "Atlos Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"type": "str",
|
||||
"required": True,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
|
||||
|
||||
### Features
|
||||
- Connects to the Atlos API to retrieve a list of source material URLs.
|
||||
- Filters source materials based on visibility, processing status, and metadata.
|
||||
- Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
|
||||
- Iterates through paginated results using a cursor for efficient API interaction.
|
||||
|
||||
### Notes
|
||||
- Requires an Atlos API endpoint and a valid API token for authentication.
|
||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||
- Handles pagination transparently when retrieving data from the Atlos API.
|
||||
"""
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AtlosFeeder(Feeder):
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
# Get all the urls from the Atlos API
|
||||
count = 0
|
||||
cursor = None
|
||||
while True:
|
||||
response = requests.get(
|
||||
f"{self.atlos_url}/api/v2/source_material",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
params={"cursor": cursor},
|
||||
)
|
||||
data = response.json()
|
||||
response.raise_for_status()
|
||||
cursor = data["next"]
|
||||
|
||||
for item in data["results"]:
|
||||
if (
|
||||
item["source_url"] not in [None, ""]
|
||||
and (
|
||||
item["metadata"]
|
||||
.get("auto_archiver", {})
|
||||
.get("processed", False)
|
||||
!= True
|
||||
)
|
||||
and item["visibility"] == "visible"
|
||||
and item["status"] not in ["processing", "pending"]
|
||||
):
|
||||
yield Metadata().set_url(item["source_url"]).set(
|
||||
"atlos_id", item["id"]
|
||||
)
|
||||
count += 1
|
||||
|
||||
if len(data["results"]) == 0 or cursor is None:
|
||||
break
|
||||
@@ -0,0 +1 @@
|
||||
from .atlos_feeder_db_storage import AtlosFeederDbStorage
|
||||
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"name": "Atlos Feeder Database Storage",
|
||||
"type": ["feeder", "database", "storage"],
|
||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"type": "str",
|
||||
"required": True,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media,
|
||||
|
||||
[Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT).
|
||||
It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence.
|
||||
|
||||
To get started create a new project and obtain an API token from the settings page. You can group event's into Atlos's 'incidents'.
|
||||
Here you can add 'source material' by URLn and the Atlos feeder will fetch these URLs for archival.
|
||||
|
||||
You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended.
|
||||
The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uplaoded back into the relevant media item.
|
||||
|
||||
|
||||
### Features
|
||||
- Connects to the Atlos API to retrieve a list of source material URLs.
|
||||
- Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive.
|
||||
- If the storage option is selected, it will store the media files alongside the original source material item in Atlos.
|
||||
- Is the database option is selected it will output the results to the media item, as well as updating failure status with error details when archiving fails.
|
||||
- Skips Storege/ database upload for items without an Atlos ID - restricting that you must use the Atlos feeder so that it has the Atlos ID to store the results with.
|
||||
|
||||
### Notes
|
||||
- Requires an Atlos account with a project and a valid API token for authentication.
|
||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||
- Feches any media items within an Atlos project, regardless of separation into incidents.
|
||||
"""
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Iterator, Optional, Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
|
||||
from auto_archiver.utils import calculate_file_hash
|
||||
|
||||
|
||||
class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
|
||||
def setup(self) -> requests.Session:
|
||||
"""create and return a persistent session."""
|
||||
self.session = requests.Session()
|
||||
|
||||
def _get(self, endpoint: str, params: Optional[dict] = None) -> dict:
|
||||
"""Wrapper for GET requests to the Atlos API."""
|
||||
url = f"{self.atlos_url}{endpoint}"
|
||||
response = self.session.get(
|
||||
url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def _post(
|
||||
self,
|
||||
endpoint: str,
|
||||
json: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
files: Optional[dict] = None,
|
||||
) -> dict:
|
||||
"""Wrapper for POST requests to the Atlos API."""
|
||||
url = f"{self.atlos_url}{endpoint}"
|
||||
response = self.session.post(
|
||||
url,
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
json=json,
|
||||
params=params,
|
||||
files=files,
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
# ! Atlos Module - Feeder Methods
|
||||
|
||||
def __iter__(self) -> Iterator[Metadata]:
|
||||
"""Iterate over unprocessed, visible source materials from Atlos."""
|
||||
cursor = None
|
||||
while True:
|
||||
data = self._get("/api/v2/source_material", params={"cursor": cursor})
|
||||
cursor = data.get("next")
|
||||
results = data.get("results", [])
|
||||
for item in results:
|
||||
if (
|
||||
item.get("source_url") not in [None, ""]
|
||||
and not item.get("metadata", {}).get("auto_archiver", {}).get("processed", False)
|
||||
and item.get("visibility") == "visible"
|
||||
and item.get("status") not in ["processing", "pending"]
|
||||
):
|
||||
yield Metadata().set_url(item["source_url"]).set("atlos_id", item["id"])
|
||||
if not results or cursor is None:
|
||||
break
|
||||
|
||||
# ! Atlos Module - Database Methods
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
"""Mark an item as failed in Atlos, if the ID exists."""
|
||||
atlos_id = item.metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
return
|
||||
self._post(
|
||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||
json={"metadata": {"processed": True, "status": "error", "error": reason}},
|
||||
)
|
||||
logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check and fetch if the given item has been archived already, each
|
||||
database should handle its own caching, and configuration mechanisms"""
|
||||
return False
|
||||
|
||||
def _process_metadata(self, item: Metadata) -> dict:
|
||||
"""Process metadata for storage on Atlos. Will convert any datetime
|
||||
objects to ISO format."""
|
||||
return {
|
||||
k: v.isoformat() if hasattr(v, "isoformat") else v
|
||||
for k, v in item.metadata.items()
|
||||
}
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""Mark an item as successfully archived in Atlos."""
|
||||
atlos_id = item.metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
|
||||
return
|
||||
self._post(
|
||||
f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
|
||||
json={
|
||||
"metadata": {
|
||||
"processed": True,
|
||||
"status": "success",
|
||||
"results": self._process_metadata(item),
|
||||
}
|
||||
},
|
||||
)
|
||||
logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")
|
||||
|
||||
# ! Atlos Module - Storage Methods
|
||||
|
||||
def get_cdn_url(self, _media: Media) -> str:
|
||||
"""Return the base Atlos URL as the CDN URL."""
|
||||
return self.atlos_url
|
||||
|
||||
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
||||
"""Upload a media file to Atlos if it has not been uploaded already."""
|
||||
if metadata is None:
|
||||
logger.error(f"No metadata provided for {media.filename}")
|
||||
return False
|
||||
|
||||
atlos_id = metadata.get("atlos_id")
|
||||
if not atlos_id:
|
||||
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
|
||||
return False
|
||||
|
||||
media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
||||
|
||||
# Check whether the media has already been uploaded
|
||||
source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"]
|
||||
existing_media = [
|
||||
artifact.get("file_hash_sha256")
|
||||
for artifact in source_material.get("artifacts", [])
|
||||
]
|
||||
if media_hash in existing_media:
|
||||
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
||||
return True
|
||||
|
||||
# Upload the media to the Atlos API
|
||||
with open(media.filename, "rb") as file_obj:
|
||||
self._post(
|
||||
f"/api/v2/source_material/upload/{atlos_id}",
|
||||
params={"title": media.properties},
|
||||
files={"file": (os.path.basename(media.filename), file_obj)},
|
||||
)
|
||||
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||
return True
|
||||
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""Upload a file-like object; not implemented."""
|
||||
pass
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Optional
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
class AtlosStorage(Storage):
|
||||
|
||||
def get_cdn_url(self, _media: Media) -> str:
|
||||
# It's not always possible to provide an exact URL, because it's
|
||||
# possible that the media once uploaded could have been copied to
|
||||
# another project.
|
||||
return self.atlos_url
|
||||
|
||||
def _hash(self, media: Media) -> str:
|
||||
# Hash the media file using sha-256. We don't use the existing auto archiver
|
||||
# hash because there's no guarantee that the configuerer is using sha-256, which
|
||||
# is how Atlos hashes files.
|
||||
|
||||
sha256 = hashlib.sha256()
|
||||
with open(media.filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(4096)
|
||||
if not buf: break
|
||||
sha256.update(buf)
|
||||
return sha256.hexdigest()
|
||||
|
||||
def upload(self, media: Media, metadata: Optional[Metadata]=None, **_kwargs) -> bool:
|
||||
atlos_id = metadata.get("atlos_id")
|
||||
if atlos_id is None:
|
||||
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
|
||||
return False
|
||||
|
||||
media_hash = self._hash(media)
|
||||
|
||||
# Check whether the media has already been uploaded
|
||||
source_material = requests.get(
|
||||
f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
).json()["result"]
|
||||
existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
|
||||
if media_hash in existing_media:
|
||||
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
||||
return True
|
||||
|
||||
# Upload the media to the Atlos API
|
||||
requests.post(
|
||||
f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
params={
|
||||
"title": media.properties
|
||||
},
|
||||
files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))},
|
||||
).raise_for_status()
|
||||
|
||||
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
'name': 'Command Line Feeder',
|
||||
'type': ['feeder'],
|
||||
'entry_point': 'cli_feeder::CLIFeeder',
|
||||
'requires_setup': False,
|
||||
'description': 'Feeds URLs to orchestrator from the command line',
|
||||
'configs': {
|
||||
'urls': {
|
||||
'default': None,
|
||||
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
|
||||
},
|
||||
},
|
||||
'description': """
|
||||
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
||||
without the need to specify any additional configuration or command line arguments:
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
|
||||
|
||||
You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
|
||||
""",
|
||||
}
|
||||
20
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
20
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.urls = self.config['urls']
|
||||
if not self.urls:
|
||||
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
urls = self.config['urls']
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
logger.info(f"STARTED {item}")
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
logger.error(f"FAILED {item}: {reason}")
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"description": """
|
||||
Handles exporting archival results to a CSV file.
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
Author: Dave Mateer, (And maintained by: )
|
||||
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
|
||||
|
||||
### Features
|
||||
|
||||
@@ -28,6 +28,13 @@ the broader archiving framework.
|
||||
metadata objects. Some dropins are included in this generic_archiver by default, but
|
||||
custom dropins can be created to handle additional websites and passed to the archiver
|
||||
via the command line using the `--dropins` option (TODO!).
|
||||
|
||||
### Auto-Updates
|
||||
|
||||
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
|
||||
This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
|
||||
If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
|
||||
|
||||
""",
|
||||
"configs": {
|
||||
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
|
||||
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
|
||||
},
|
||||
"ytdlp_update_interval": {
|
||||
"default": 5,
|
||||
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
|
||||
"type": "int",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -39,11 +39,11 @@ class Bluesky(GenericDropin):
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(image_media)
|
||||
media.append(Media(image_media))
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(video_media)
|
||||
media.append(Media(video_media))
|
||||
return media
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,8 @@ class Facebook(GenericDropin):
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
|
||||
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
|
||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||
post_data = ie_instance._extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import datetime, os, yt_dlp, pysubs2
|
||||
import datetime, os
|
||||
import importlib
|
||||
from typing import Type
|
||||
import subprocess
|
||||
from typing import Generator, Type
|
||||
|
||||
import yt_dlp
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
import pysubs2
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -11,7 +15,45 @@ from auto_archiver.core import Metadata, Media
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
def suitable_extractors(self, url: str) -> list[str]:
|
||||
def setup(self):
|
||||
# check for file .ytdlp-update in the secrets folder
|
||||
if self.ytdlp_update_interval < 0:
|
||||
return
|
||||
|
||||
use_secrets = os.path.exists('secrets')
|
||||
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
|
||||
next_update_check = None
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as f:
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
|
||||
next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
|
||||
with open(path, "w") as f:
|
||||
f.write(next_update_check.isoformat())
|
||||
|
||||
def update_ytdlp(self):
|
||||
logger.info("Checking and updating yt-dlp...")
|
||||
logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
|
||||
from importlib.metadata import version as get_version
|
||||
old_version = get_version("yt-dlp")
|
||||
try:
|
||||
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
||||
result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
|
||||
|
||||
if "Successfully installed yt-dlp" in result.stdout.decode():
|
||||
new_version = importlib.metadata.version("yt-dlp")
|
||||
logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
|
||||
importlib.reload(yt_dlp)
|
||||
else:
|
||||
logger.info("yt-dlp already up to date")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating yt-dlp: {e}")
|
||||
|
||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||
"""
|
||||
Returns a list of valid extractors for the given URL"""
|
||||
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
|
||||
@@ -86,7 +128,7 @@ class GenericExtractor(Extractor):
|
||||
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
|
||||
result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
|
||||
result.set_url(url)
|
||||
|
||||
if "description" in video_data: result.set_content(video_data["description"])
|
||||
# extract comments if enabled
|
||||
if self.comments:
|
||||
result.set("comments", [{
|
||||
@@ -116,7 +158,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
"""
|
||||
Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
|
||||
Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
|
||||
"""
|
||||
|
||||
ie_instance = info_extractor(downloader=ydl)
|
||||
@@ -266,6 +308,11 @@ class GenericExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
if url.startswith("https://ya.ru"):
|
||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||
item.set("replaced_url", url)
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
||||
'quiet': False, 'noplaylist': not self.allow_playlist ,
|
||||
@@ -275,7 +322,8 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
|
||||
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
@@ -284,12 +332,12 @@ class GenericExtractor(Extractor):
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookie_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
|
||||
ydl_options['cookiesfile'] = auth['cookies_file']
|
||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
||||
ydl_options['cookiefile'] = auth['cookies_file']
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .gsheet_db import GsheetsDb
|
||||
@@ -1,38 +0,0 @@
|
||||
{
|
||||
"name": "Google Sheets Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "gsheet_db::GsheetsDb",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsDatabase:
|
||||
Handles integration with Google Sheets for tracking archival tasks.
|
||||
|
||||
### Features
|
||||
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Notes
|
||||
- Currently works only with metadata provided by GsheetFeeder.
|
||||
- Requires configuration of a linked Google Sheet and appropriate API credentials.
|
||||
"""
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
from typing import Union, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
"""
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check if the given item has been archived already"""
|
||||
return False
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item.get_url()}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
# self._safe_status_update(item, 'done')
|
||||
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
try:
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
status_message = f"[cached] {status_message}"
|
||||
cell_updates.append((row, "status", status_message))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid("archive", "\n".join(media.urls))
|
||||
batch_if_valid("date", True, get_current_timestamp())
|
||||
batch_if_valid("title", item.get_title())
|
||||
batch_if_valid("text", item.get("content", ""))
|
||||
batch_if_valid("timestamp", item.get_timestamp())
|
||||
if media:
|
||||
batch_if_valid("hash", media.get("hash", "not-calculated"))
|
||||
|
||||
# merge all pdq hashes into a single string, if present
|
||||
pdq_hashes = []
|
||||
all_media = item.get_all_media()
|
||||
for m in all_media:
|
||||
if pdq := m.get("pdq_hash"):
|
||||
pdq_hashes.append(pdq)
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
batch_if_valid(
|
||||
"replaywebpage",
|
||||
"\n".join(
|
||||
[
|
||||
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
|
||||
for wacz in browsertrix.urls
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
@@ -1,2 +0,0 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder import GsheetsFeeder
|
||||
@@ -1,71 +0,0 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {
|
||||
"default": None,
|
||||
"help": "(alternative to sheet name) the id of the sheet to archive",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path",
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder
|
||||
A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Features
|
||||
- Validates the sheet structure and filters rows based on input configurations.
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
|
||||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
""",
|
||||
}
|
||||
@@ -1,96 +0,0 @@
|
||||
"""
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
import gspread
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
from . import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Feeder):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
assert self.sheet or self.sheet_id, (
|
||||
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
)
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
return self.gsheets_client.open(self.sheet)
|
||||
else: # self.sheet_id
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
else:
|
||||
m.set_context("folder", folder)
|
||||
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
# ALLOW rules exist AND sheet name not explicitly allowed
|
||||
return False
|
||||
if len(self.block_worksheets) and sheet_name in self.block_worksheets:
|
||||
# BLOCK rules exist AND sheet name is blocked
|
||||
return False
|
||||
return True
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
for required_col in ['url', 'status']:
|
||||
if not gw.col_exists(required_col):
|
||||
missing.append(required_col)
|
||||
return missing
|
||||
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
94
src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py
Normal file
94
src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py
Normal file
@@ -0,0 +1,94 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder Database",
|
||||
"type": ["feeder", "database"],
|
||||
"entry_point": "gsheet_feeder_db::GsheetsFeederDB",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": True,
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeederDatabase
|
||||
A Google Sheets-based feeder and optional database for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Features
|
||||
- Validates the sheet structure and filters rows based on input configurations.
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
- If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
- The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
|
||||
""",
|
||||
}
|
||||
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import quote
|
||||
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder, Database, Media
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||
from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
return self.gsheets_client.open(self.sheet)
|
||||
else: # self.sheet_id
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
else:
|
||||
m.set_context("folder", folder)
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
# ALLOW rules exist AND sheet name not explicitly allowed
|
||||
return False
|
||||
if len(self.block_worksheets) and sheet_name in self.block_worksheets:
|
||||
# BLOCK rules exist AND sheet name is blocked
|
||||
return False
|
||||
return True
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
for required_col in ['url', 'status']:
|
||||
if not gw.col_exists(required_col):
|
||||
missing.append(required_col)
|
||||
return missing
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check if the given item has been archived already"""
|
||||
return False
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item.get_url()}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
# self._safe_status_update(item, 'done')
|
||||
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
try:
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
status_message = f"[cached] {status_message}"
|
||||
cell_updates.append((row, "status", status_message))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid("archive", "\n".join(media.urls))
|
||||
batch_if_valid("date", True, get_current_timestamp())
|
||||
batch_if_valid("title", item.get_title())
|
||||
batch_if_valid("text", item.get("content", ""))
|
||||
batch_if_valid("timestamp", item.get_timestamp())
|
||||
if media:
|
||||
batch_if_valid("hash", media.get("hash", "not-calculated"))
|
||||
|
||||
# merge all pdq hashes into a single string, if present
|
||||
pdq_hashes = []
|
||||
all_media = item.get_all_media()
|
||||
for m in all_media:
|
||||
if pdq := m.get("pdq_hash"):
|
||||
pdq_hashes.append(pdq)
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
batch_if_valid(
|
||||
"replaywebpage",
|
||||
"\n".join(
|
||||
[
|
||||
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
|
||||
for wacz in browsertrix.urls
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
@@ -17,6 +17,7 @@ class GWorksheet:
|
||||
'thumbnail': 'thumbnail',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'pdq_hash': 'perceptual hashes',
|
||||
@@ -7,7 +7,9 @@
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
"detect_thumbnails": {"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool"},
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
||||
|
||||
@@ -9,9 +9,7 @@ import base64
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
@@ -51,7 +49,7 @@ class HtmlFormatter(Formatter):
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
# get the already instantiated hash_enricher module
|
||||
he = get_module('hash_enricher', self.config)
|
||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -200,7 +200,7 @@
|
||||
el.innerHTML = decodeCertificate(certificate);
|
||||
|
||||
let cyberChefUrl =
|
||||
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
|
||||
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
|
||||
// create a new anchor with this url and append after the code
|
||||
let a = document.createElement("a");
|
||||
a.href = cyberChefUrl;
|
||||
|
||||
@@ -10,25 +10,30 @@
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"required": True,
|
||||
"help": "a valid Instagram username"},
|
||||
"help": "A valid Instagram username."},
|
||||
"password": {
|
||||
"required": True,
|
||||
"help": "the corresponding Instagram account password",
|
||||
"help": "The corresponding Instagram account password.",
|
||||
},
|
||||
"download_folder": {
|
||||
"default": "instaloader",
|
||||
"help": "name of a folder to temporarily download content to",
|
||||
"help": "Name of a folder to temporarily download content to.",
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/instaloader.session",
|
||||
"help": "path to the instagram session which saves session credentials",
|
||||
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
|
||||
},
|
||||
# TODO: fine-grain
|
||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||
},
|
||||
"description": """
|
||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
|
||||
|
||||
> ⚠️ **Warning**
|
||||
> This module is not actively maintained due to known issues with blocking.
|
||||
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
|
||||
|
||||
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts.
|
||||
Authentication is required via username/password or a session file.
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
|
||||
"""
|
||||
import re, os, shutil, traceback
|
||||
import re, os, shutil
|
||||
import instaloader
|
||||
from loguru import logger
|
||||
|
||||
@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
|
||||
"""
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
|
||||
# NB: post regex should be tested before profile
|
||||
|
||||
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
||||
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
||||
# https://regex101.com/r/6Wbsxa/1
|
||||
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
|
||||
def setup(self) -> None:
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
download_geotags=True,
|
||||
download_comments=True,
|
||||
compress_json=False,
|
||||
dirname_pattern=self.download_folder,
|
||||
filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
)
|
||||
try:
|
||||
self.insta.load_session_from_file(self.username, self.session_file)
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
|
||||
try:
|
||||
self.insta.login(self.username, config.instagram_self.password)
|
||||
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
|
||||
logger.debug(f"Session file failed", exc_info=True)
|
||||
logger.info("No valid session file found - Attempting login with use and password.")
|
||||
self.insta.login(self.username, self.password)
|
||||
self.insta.save_session_to_file(self.session_file)
|
||||
except Exception as e2:
|
||||
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
|
||||
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
# This may be outdated and replaced by the below message, but keeping until confirmed
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if "Media not found or unavailable" in message:
|
||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
@@ -103,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
||||
@@ -6,25 +6,42 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
class LocalStorage(Storage):
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
if len(self.save_to) > 200:
|
||||
raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
# TODO: is this viable with Storage.configs on path/filename?
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
dest = media.key
|
||||
|
||||
if self.save_absolute:
|
||||
dest = os.path.abspath(dest)
|
||||
return dest
|
||||
|
||||
def set_key(self, media, url, metadata):
|
||||
# clarify we want to save the file to the save_to folder
|
||||
|
||||
old_folder = metadata.get('folder', '')
|
||||
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
|
||||
super().set_key(media, url, metadata)
|
||||
# don't impact other storages that might want a different 'folder' set
|
||||
metadata.set_context('folder', old_folder)
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
# override parent so that we can use shutil.copy2 and keep metadata
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
dest = media.key
|
||||
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
||||
|
||||
res = shutil.copy2(media.filename, dest)
|
||||
logger.info(res)
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
@@ -42,7 +42,7 @@ class S3Storage(Storage):
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
@@ -50,13 +50,13 @@ class S3Storage(Storage):
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
media.key = existing_key
|
||||
media._key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
@@ -66,5 +66,4 @@ class S3Storage(Storage):
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
return False
|
||||
|
||||
return False
|
||||
@@ -4,16 +4,27 @@
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
"bin": ["chromedriver"]
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"width": {"default": 1280,
|
||||
"type": "int",
|
||||
"help": "width of the screenshots"},
|
||||
"height": {"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"},
|
||||
"timeout": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
"save_to_pdf": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader"},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
def __init__(self, webdriver_factory=None):
|
||||
super().__init__()
|
||||
self.webdriver_factory = webdriver_factory or Webdriver
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
with self.webdriver_factory(
|
||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
"skip_when_nothing_archived": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
@@ -20,5 +20,6 @@
|
||||
- Processes HTML content of messages to retrieve embedded media.
|
||||
- Sets structured metadata, including timestamps, content, and media details.
|
||||
- Does not require user authentication for Telegram.
|
||||
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "telethon_extractor",
|
||||
"name": "Telethon Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
@@ -14,11 +14,13 @@
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"join_channels": {"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"type": "json_loader",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
@@ -40,5 +42,9 @@ To use the `TelethonExtractor`, you must configure the following:
|
||||
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
|
||||
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
|
||||
|
||||
### First Time Login
|
||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
@@ -7,8 +7,12 @@
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
"thumbnails_per_minute": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||
return
|
||||
|
||||
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
|
||||
|
||||
thumbnails_media = []
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
from .tiktok_tikwm_extractor import TiktokTikwmExtractor
|
||||
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"name": "Tiktok Tikwm Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
"bin": []
|
||||
},
|
||||
"description": """
|
||||
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
|
||||
|
||||
This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.
|
||||
|
||||
### Features
|
||||
- Downloads the video and, if possible, also the video cover.
|
||||
- Stores extra metadata about the post like author information, and more as returned by tikwm.com.
|
||||
|
||||
### Notes
|
||||
- If tikwm.com is down, this extractor will not work.
|
||||
- If tikwm.com changes their API, this extractor may break.
|
||||
- If no video is found, this extractor will consider the extraction failed.
|
||||
"""
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
import re
|
||||
import requests
|
||||
from loguru import logger
|
||||
from datetime import datetime, timezone
|
||||
from yt_dlp.extractor.tiktok import TikTokIE
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class TiktokTikwmExtractor(Extractor):
|
||||
"""
|
||||
Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
|
||||
"""
|
||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
if not re.match(TikTokIE._VALID_URL, url):
|
||||
return False
|
||||
|
||||
endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=url)
|
||||
|
||||
r = requests.get(endpoint)
|
||||
if r.status_code != 200:
|
||||
logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
|
||||
return False
|
||||
|
||||
try:
|
||||
json_response = r.json()
|
||||
except ValueError:
|
||||
logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
|
||||
return False
|
||||
|
||||
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
|
||||
logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
|
||||
return False
|
||||
|
||||
# tries to get the non-watermarked version first
|
||||
video_url = api_data.pop("play", api_data.pop("wmplay", None))
|
||||
if not video_url:
|
||||
logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
|
||||
return False
|
||||
|
||||
# prepare result, start by downloading video
|
||||
result = Metadata()
|
||||
|
||||
# get the cover if possible
|
||||
cover_url = api_data.pop("origin_cover", api_data.pop("cover", api_data.pop("ai_dynamic_cover", None)))
|
||||
if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
|
||||
result.add_media(Media(cover_downloaded))
|
||||
|
||||
# get the video or fail
|
||||
video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
|
||||
if not video_downloaded:
|
||||
logger.error(f"failed to download video from {video_url}")
|
||||
return False
|
||||
video_media = Media(video_downloaded)
|
||||
if duration := api_data.pop("duration", None):
|
||||
video_media.set("duration", duration)
|
||||
result.add_media(video_media)
|
||||
|
||||
# add remaining metadata
|
||||
result.set_title(api_data.pop("title", ""))
|
||||
|
||||
if created_at := api_data.pop("create_time", None):
|
||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||
|
||||
if (author := api_data.pop("author", None)):
|
||||
result.set("author", author)
|
||||
|
||||
result.set("api_data", api_data)
|
||||
|
||||
return result.success("tikwm")
|
||||
@@ -1 +0,0 @@
|
||||
from .wacz_enricher import WaczExtractorEnricher
|
||||
@@ -0,0 +1 @@
|
||||
from .wacz_extractor_enricher import WaczExtractorEnricher
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"name": "WACZ Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
@@ -17,11 +17,19 @@
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"timeout": {"default": 120,
|
||||
"type": "int",
|
||||
"help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
"extract_media": {"default": False,
|
||||
"type": 'bool',
|
||||
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"extract_screenshot": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
@@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
to_enrich.add_media(m, warc_fn)
|
||||
counter += 1
|
||||
seen_urls.add(record_url)
|
||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"name": "Wayback Machine Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
@@ -9,6 +9,7 @@
|
||||
"configs": {
|
||||
"timeout": {
|
||||
"default": 15,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
|
||||
},
|
||||
"if_not_archived_within": {
|
||||
|
||||
@@ -10,8 +10,12 @@
|
||||
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"required": True,
|
||||
"help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"include_srt": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90,
|
||||
"type": "int",
|
||||
"help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate",
|
||||
"help": "which Whisper operation to execute",
|
||||
"choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
"""
|
||||
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.stores = self.config['steps']['storages']
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
||||
if not "s3_storage" in self.stores:
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
# TODO: this used to pass all storage items to store now
|
||||
# Now only passing S3, the rest will get added later in the usual order (?)
|
||||
# Only storing S3, the rest will get added later in the usual order (?)
|
||||
m.store(url=url, metadata=to_enrich, storages=[self.s3])
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
|
||||
Reference in New Issue
Block a user