mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Update modules for new core structure.
This commit is contained in:
@@ -1,14 +1,14 @@
|
||||
{
|
||||
"name": "Google Drive Storage",
|
||||
"type": ["storage"],
|
||||
"author": "Dave Mateer",
|
||||
"entry_point": "gdrive_storage::GDriveStorage",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"google-api-python-client",
|
||||
"google-auth",
|
||||
"google-auth-oauthlib",
|
||||
"google-auth-httplib2"
|
||||
"googleapiclient",
|
||||
"google",
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
@@ -18,17 +18,24 @@
|
||||
"choices": ["flat", "url", "random"],
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"root_folder_id": {"default": None,
|
||||
# "required": True,
|
||||
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None,
|
||||
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
|
||||
},
|
||||
"description": """
|
||||
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
Author: Dave Mateer, (And maintained by: )
|
||||
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
|
||||
|
||||
### Features
|
||||
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
|
||||
- Supports OAuth token-based authentication or service account credentials for API access.
|
||||
@@ -39,5 +46,55 @@
|
||||
- Requires setup with either a Google OAuth token or a service account JSON file.
|
||||
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
|
||||
- Automatically handles Google Drive API token refreshes for long-running jobs.
|
||||
"""
|
||||
|
||||
## Overview
|
||||
This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
|
||||
|
||||
## Features
|
||||
- Saves files to Google Drive, organizing them into structured folders.
|
||||
- Supports both **service account** and **OAuth token** authentication.
|
||||
- Automatically creates folders if they don't exist.
|
||||
- Generates public URLs for easy file sharing.
|
||||
|
||||
## Setup Guide
|
||||
1. **Enable Google Drive API**
|
||||
- Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
|
||||
- Enable the **Google Drive API**.
|
||||
|
||||
2. **Set Up a Google Drive Folder**
|
||||
- Create a folder in **Google Drive** and copy its **folder ID** from the URL.
|
||||
- Add the **folder ID** to your configuration (`orchestration.yaml`):
|
||||
```yaml
|
||||
root_folder_id: "FOLDER_ID"
|
||||
```
|
||||
|
||||
3. **Authentication Options**
|
||||
- **Option 1: Service Account (Recommended)**
|
||||
- Create a **service account** in Google Cloud IAM.
|
||||
- Download the JSON key file and save it as:
|
||||
```
|
||||
secrets/service_account.json
|
||||
```
|
||||
- **Share your Drive folder** with the service account’s `client_email` (found in the JSON file).
|
||||
|
||||
- **Option 2: OAuth Token (User Authentication)**
|
||||
- Create OAuth **Desktop App credentials** in Google Cloud.
|
||||
- Save the credentials as:
|
||||
```
|
||||
secrets/oauth_credentials.json
|
||||
```
|
||||
- Generate an OAuth token by running:
|
||||
```sh
|
||||
python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
|
||||
```
|
||||
|
||||
|
||||
Notes on the OAuth token:
|
||||
Tokens are refreshed after 1 hour but keep working for 7 days (tbc),
|
||||
so as long as the job runs for less than 7 days, refreshing only once per run will work.
|
||||
see this link for details on the token:
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
|
||||
@@ -1,68 +1,69 @@
|
||||
|
||||
import shutil, os, time, json
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import IO
|
||||
from loguru import logger
|
||||
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from google.auth.transport.requests import Request
|
||||
from google.oauth2 import service_account
|
||||
from google.oauth2.credentials import Credentials
|
||||
from google.auth.transport.requests import Request
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
|
||||
|
||||
class GDriveStorage(Storage):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
def setup(self, config: dict) -> None:
    """Apply the module configuration and connect to Google Drive.

    Args:
        config: Configuration mapping forwarded to the parent ``setup``.
    """
    # The parent setup runs first: it assigns the config values dynamically,
    # and the Drive initialization below reads them from ``self``.
    super().setup(config)

    drive_scope = 'https://www.googleapis.com/auth/drive'
    self.scopes = [drive_scope]

    self._setup_google_drive_service()
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
|
||||
if self.oauth_token is not None:
|
||||
"""
|
||||
Tokens are refreshed after 1 hour
|
||||
however keep working for 7 days (tbc)
|
||||
so as long as the job doesn't last for 7 days
|
||||
then this method of refreshing only once per run will work
|
||||
see this link for details on the token
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
"""
|
||||
logger.debug(f'Using GD OAuth token {self.oauth_token}')
|
||||
# workaround for missing 'refresh_token' in from_authorized_user_file
|
||||
with open(self.oauth_token, 'r') as stream:
|
||||
creds_json = json.load(stream)
|
||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
||||
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
|
||||
# creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
|
||||
|
||||
if not creds or not creds.valid:
|
||||
if creds and creds.expired and creds.refresh_token:
|
||||
logger.debug('Requesting new GD OAuth token')
|
||||
creds.refresh(Request())
|
||||
else:
|
||||
raise Exception("Problem with creds - create the token again")
|
||||
|
||||
# Save the credentials for the next run
|
||||
with open(self.oauth_token, 'w') as token:
|
||||
logger.debug('Saving new GD OAuth token')
|
||||
token.write(creds.to_json())
|
||||
else:
|
||||
logger.debug('GD OAuth Token valid')
|
||||
def _setup_google_drive_service(self):
|
||||
"""Initialize Google Drive service based on provided credentials."""
|
||||
if self.oauth_token:
|
||||
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
|
||||
self.service = self._initialize_with_oauth_token()
|
||||
elif self.service_account:
|
||||
logger.debug(f"Using Google Drive service account: {self.service_account}")
|
||||
self.service = self._initialize_with_service_account()
|
||||
else:
|
||||
gd_service_account = self.service_account
|
||||
logger.debug(f'Using GD Service Account {gd_service_account}')
|
||||
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
|
||||
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
||||
|
||||
self.service = build('drive', 'v3', credentials=creds)
|
||||
def _initialize_with_oauth_token(self):
    """Build a Drive v3 client from the OAuth token file at ``self.oauth_token``.

    The token JSON is loaded and turned into credentials for ``self.scopes``.
    If the credentials are not valid but are expired and carry a refresh
    token, they are refreshed and written back to the same file.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=...)``.

    Raises:
        ValueError: If the credentials are not valid and cannot be refreshed.
    """
    token_path = self.oauth_token
    with open(token_path, 'r') as token_in:
        token_info = json.load(token_in)
    # Make sure the key is present (empty when the file lacks it) before the
    # credentials object is constructed from the dict.
    token_info.setdefault('refresh_token', "")

    oauth_creds = Credentials.from_authorized_user_info(token_info, self.scopes)

    if not oauth_creds.valid:
        can_refresh = oauth_creds.expired and oauth_creds.refresh_token
        if not can_refresh:
            raise ValueError("Invalid OAuth token. Please regenerate the token.")
        oauth_creds.refresh(Request())
        # Persist the refreshed token so the next run starts from it.
        with open(token_path, 'w') as token_out:
            logger.debug("Saving refreshed OAuth token.")
            token_out.write(oauth_creds.to_json())

    return build('drive', 'v3', credentials=oauth_creds)
|
||||
|
||||
def _initialize_with_service_account(self):
    """Build a Drive v3 client from the service account file.

    Reads the key file at ``self.service_account`` and requests
    ``self.scopes`` for the resulting credentials.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=...)``.
    """
    account_creds = service_account.Credentials.from_service_account_file(
        self.service_account,
        scopes=self.scopes,
    )
    drive_client = build('drive', 'v3', credentials=account_creds)
    return drive_client
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
"""
|
||||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
|
||||
# full_name = os.path.join(self.folder, media.key)
|
||||
parent_id, folder_id = self.root_folder_id, None
|
||||
path_parts = media.key.split(os.path.sep)
|
||||
@@ -77,7 +78,7 @@ class GDriveStorage(Storage):
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
"""
|
||||
1. for each sub-folder in the path check if exists or create
|
||||
2. upload file to root_id/other_paths.../filename
|
||||
@@ -168,8 +169,3 @@ class GDriveStorage(Storage):
|
||||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
|
||||
return gd_folder.get('id')
|
||||
|
||||
# def exists(self, key):
|
||||
# try:
|
||||
# self.get_cdn_url(key)
|
||||
# return True
|
||||
# except: return False
|
||||
|
||||
Reference in New Issue
Block a user