mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 04:38:29 +03:00
Merge pull request #56 from djhmateer/oauth
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -17,4 +17,6 @@ config-*.yaml
|
||||
logs/*
|
||||
local_archive/
|
||||
vk_config*.json
|
||||
gd-token.json
|
||||
credentials.json
|
||||
secrets/*
|
||||
@@ -118,6 +118,7 @@ class Config:
|
||||
gd = secrets["google_drive"]
|
||||
self.gd_config = GDConfig(
|
||||
root_folder_id=gd.get("root_folder_id"),
|
||||
oauth_token_filename=gd.get("oauth_token_filename"),
|
||||
service_account=gd.get("service_account", GDConfig.service_account)
|
||||
)
|
||||
|
||||
|
||||
73
create_update_test_oauth_token.py
Normal file
73
create_update_test_oauth_token.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import os.path
|
||||
|
||||
from google.auth.transport.requests import Request
|
||||
from google.oauth2.credentials import Credentials
|
||||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.errors import HttpError
|
||||
|
||||
# If creating the token for the first time, download the OAuth client ID JSON file `credentials.json` from https://console.cloud.google.com/apis/credentials (section "OAuth 2.0 Client IDs")
|
||||
# add "http://localhost:55192/" to the list of "Authorised redirect URIs"
|
||||
# https://davemateer.com/2022/04/28/google-drive-with-python for more information
|
||||
|
||||
# You can run this code to get a new token and verify it belongs to the correct user
|
||||
# This token will be refreshed automatically by the auto-archiver
|
||||
|
||||
# Code below from https://developers.google.com/drive/api/quickstart/python
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
|
||||
|
||||
def main():
    """Create, refresh, or validate the Google Drive OAuth token, then smoke-test it.

    Loads the stored token from ``gd-token.json`` when that file exists.
    If the token is missing or invalid it is either refreshed (when it is
    expired and carries a refresh token) or re-created through the
    interactive browser login, and the resulting token is written back to
    ``gd-token.json``. Finally the Drive v3 API is called to print the
    e-mail address of the authenticated user and up to 10 files, so the
    operator can verify the token belongs to the intended account.

    Drive API ``HttpError`` failures are reported on stdout rather than
    raised.
    """
    # Imported locally so the fix is self-contained within this function.
    from google.auth.exceptions import RefreshError

    token_file = 'gd-token.json'
    creds = None

    # The token file stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    if os.path.exists(token_file):
        creds = Credentials.from_authorized_user_file(token_file, SCOPES)

    if creds and creds.valid:
        print('Token valid')
    else:
        # No (valid) credentials available: refresh if possible, otherwise
        # let the user log in.
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            print('Requesting new token')
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError as error:
                # A revoked or expired refresh token cannot be recovered by
                # refreshing; fall back to the interactive login instead of
                # crashing, since this script exists to (re)create the token.
                print(f'Token refresh failed: {error}')

        if not refreshed:
            print('First run through so putting up login dialog')
            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=55192)

        # Save the credentials for the next run
        with open(token_file, 'w', encoding='utf-8') as token:
            print('Saving new token')
            token.write(creds.to_json())

    try:
        service = build('drive', 'v3', credentials=creds)

        # About the user: lets the operator confirm which account owns the token
        about = service.about().get(fields="*").execute()
        email_address = about['user']['emailAddress']
        print(email_address)

        # Call the Drive v3 API and return some files
        listing = service.files().list(
            pageSize=10, fields="nextPageToken, files(id, name)").execute()
        items = listing.get('files', [])

        if not items:
            print('No files found.')
            return
        print('Files:')
        for item in items:
            print(u'{0} ({1})'.format(item['name'], item['id']))

    except HttpError as error:
        print(f'An error occurred: {error}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -18,8 +18,19 @@ secrets:
|
||||
|
||||
# needed if you use storage=gd
|
||||
google_drive:
|
||||
# local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
|
||||
service_account: "service_account.json"
|
||||
# To authenticate with google you have two options (1. service account OR 2. OAuth token)
|
||||
|
||||
# 1. service account - storage space will count towards the developer account
|
||||
# filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
|
||||
# service_account: "service_account.json"
|
||||
|
||||
# 2. OAuth token - storage space will count towards the owner of the GDrive folder
|
||||
# (use only one of 1. or 2. - if both are specified, option 2. takes precedence)
|
||||
# needs write access on the server so refresh flow works
|
||||
# To get the token, run the file `create_update_test_oauth_token.py`
|
||||
# you can edit that file if you want a different token filename, default is "gd-token.json"
|
||||
oauth_token_filename: "gd-token.json"
|
||||
|
||||
root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
|
||||
|
||||
# needed if you use storage=local
|
||||
|
||||
@@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload
|
||||
from google.oauth2 import service_account
|
||||
|
||||
|
||||
from google.oauth2.credentials import Credentials
|
||||
from google.auth.transport.requests import Request
|
||||
|
||||
@dataclass
|
||||
class GDConfig:
|
||||
root_folder_id: str
|
||||
folder: str = "default"
|
||||
oauth_token_filename: str
|
||||
service_account: str = "service_account.json"
|
||||
|
||||
folder: str = "default"
|
||||
|
||||
class GDStorage(Storage):
|
||||
def __init__(self, config: GDConfig):
|
||||
self.folder = config.folder
|
||||
self.root_folder_id = config.root_folder_id
|
||||
creds = service_account.Credentials.from_service_account_file(
|
||||
config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
|
||||
|
||||
SCOPES=['https://www.googleapis.com/auth/drive']
|
||||
|
||||
token_file = config.oauth_token_filename
|
||||
if token_file is not None:
|
||||
"""
|
||||
Tokens are refreshed after 1 hour
|
||||
however keep working for 7 days (tbc)
|
||||
so as long as the job doesn't last for 7 days
|
||||
then this method of refreshing only once per run will work
|
||||
see this link for details on the token
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
"""
|
||||
logger.debug(f'Using GD OAuth token {token_file}')
|
||||
creds = Credentials.from_authorized_user_file(token_file, SCOPES)
|
||||
|
||||
if not creds or not creds.valid:
|
||||
if creds and creds.expired and creds.refresh_token:
|
||||
logger.debug('Requesting new GD OAuth token')
|
||||
creds.refresh(Request())
|
||||
else:
|
||||
raise Exception("Problem with creds - create the token again")
|
||||
|
||||
# Save the credentials for the next run
|
||||
with open(token_file, 'w') as token:
|
||||
logger.debug('Saving new GD OAuth token')
|
||||
token.write(creds.to_json())
|
||||
else:
|
||||
logger.debug('GD OAuth Token valid')
|
||||
else:
|
||||
gd_service_account = config.service_account
|
||||
logger.debug(f'Using GD Service Account {gd_service_account}')
|
||||
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
|
||||
|
||||
self.service = build('drive', 'v3', credentials=creds)
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
@@ -88,13 +123,18 @@ class GDStorage(Storage):
|
||||
return key[1:]
|
||||
return key
|
||||
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
|
||||
# gets the Drive folderID if it is there
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
|
||||
"""
|
||||
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
||||
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
||||
If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
|
||||
If @raise_on_missing will throw error when not found, or returns None
|
||||
Will remember previous calls to avoid duplication if @use_cache
|
||||
DM - caching giving a perf improvement in order of 41s to 46s
|
||||
So I prefer not to use yet, purely as caching notoriously hard in terms of edge cases
|
||||
and pros don't outweigh cons for me (yet)
|
||||
to be fair I just need to test this and make sure it always runs well!
|
||||
Returns the id of the file or folder from its name as a string
|
||||
"""
|
||||
# cache logic
|
||||
@@ -107,7 +147,7 @@ class GDStorage(Storage):
|
||||
|
||||
# API logic
|
||||
debug_header: str = f"[searching {name=} in {parent_id=}]"
|
||||
query_string = f"'{parent_id}' in parents and name = '{name}' "
|
||||
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
|
||||
if use_mime_type:
|
||||
query_string += f" and mimeType='application/vnd.google-apps.folder' "
|
||||
|
||||
|
||||
Reference in New Issue
Block a user