mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
refactor
This commit is contained in:
@@ -1,24 +1,23 @@
|
||||
import os, time
|
||||
|
||||
from loguru import logger
|
||||
from .base_storage import Storage
|
||||
from dataclasses import dataclass
|
||||
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from google.oauth2 import service_account
|
||||
|
||||
import time
|
||||
|
||||
|
||||
@dataclass
class GDConfig:
    """
    Settings consumed by GDStorage

    Only @root_folder_id is mandatory, the remaining fields have defaults
    """
    # id of the Google Drive folder under which all folder lookups start
    root_folder_id: str
    # fallback folder name, copied onto the storage instance at construction
    default_folder: str = "default"
    # folder path that is joined in front of every key before resolving it on Drive
    folder: str = "default"
    # path to the service account credentials file used to authenticate
    service_account: str = "service_account.json"
|
||||
|
||||
|
||||
class GDStorage(Storage):
|
||||
def __init__(self, config: GDConfig):
|
||||
self.default_folder = config.default_folder
|
||||
self.folder = config.folder
|
||||
self.root_folder_id = config.root_folder_id
|
||||
creds = service_account.Credentials.from_service_account_file(
|
||||
config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
|
||||
@@ -29,77 +28,73 @@ class GDStorage(Storage):
|
||||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
self.subfolder = self._clean_path(self.subfolder, self.default_folder, False)
|
||||
filename = key
|
||||
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
|
||||
|
||||
folder_id = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, 5, 10)
|
||||
|
||||
# check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails
|
||||
# a='youtube_dl_abcde', b='index.html'
|
||||
a, _, b = filename.partition('/')
|
||||
if b != '':
|
||||
logger.debug(f'get_cdn_url: Found a subfolder so need to split on: {a=} and {b=}')
|
||||
folder_id = self._get_id_from_parent_and_name(folder_id, a, use_mime_type=True)
|
||||
filename = b
|
||||
full_name = os.path.join(self.folder, key)
|
||||
parent_id, folder_id = self.root_folder_id, None
|
||||
path_parts = full_name.split(os.path.sep)
|
||||
filename = path_parts[-1]
|
||||
logger.info(f"looking for folders for {path_parts=} before uploading {filename=}")
|
||||
for folder in path_parts[0:-1]:
|
||||
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
|
||||
parent_id = folder_id
|
||||
|
||||
# get id of file inside folder (or sub folder)
|
||||
file_id = self._get_id_from_parent_and_name(folder_id, filename)
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def exists(self, key):
    """
    Reports whether @key can already be found on Google Drive

    Resolves @key through get_cdn_url, which raises when one of the folders or
    the file itself cannot be found; any such failure is reported as False.
    Returns True if the lookup succeeded, False otherwise
    """
    try:
        self.get_cdn_url(key)
    except Exception:
        # best-effort check: a failed lookup means "not there", but unlike a bare
        # except this still lets KeyboardInterrupt/SystemExit propagate
        return False
    return True
|
||||
|
||||
def uploadf(self, file: str, key: str, **_kwargs):
    """
    Uploads the file at local path @file to Google Drive under @key

    1. for each sub-folder in the path check if exists or create
    2. upload file to root_id/other_paths.../filename

    The destination path is self.folder joined with @key: every component but
    the last is a folder below the root folder, the last one is the file name.
    Extra keyword arguments are accepted and ignored
    """
    full_name = os.path.join(self.folder, key)
    path_parts = full_name.split(os.path.sep)
    filename = path_parts[-1]
    logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")

    # walk down from the root folder; upload_to always holds the deepest folder
    # reached so far, so a path without any folder component is uploaded into the
    # root folder instead of being created with a None parent
    upload_to = self.root_folder_id
    for folder in path_parts[0:-1]:
        folder_id = self._get_id_from_parent_and_name(upload_to, folder, use_mime_type=True, raise_on_missing=False)
        if folder_id is None:
            # raise_on_missing=False returns None for a missing folder, so create it
            folder_id = self._mkdir(folder, upload_to)
        upload_to = folder_id

    # upload file to gd
    logger.debug(f'uploading {filename=} to folder id {upload_to}')
    file_metadata = {
        'name': [filename],
        'parents': [upload_to]
    }
    media = MediaFileUpload(file, resumable=True)
    gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
|
||||
|
||||
def upload(self, filename: str, key: str, **kwargs):
    """
    Uploads the file named @filename to Google Drive under @key

    Thin wrapper around uploadf: logs the request and forwards @filename, @key
    and any extra keyword arguments unchanged
    """
    # GD only requires the filename not a file reader
    logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
    self.uploadf(filename, key, **kwargs)
|
||||
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True):
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
|
||||
"""
|
||||
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
||||
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
||||
If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
|
||||
If @raise_on_missing will throw error when not found, or returns None
|
||||
Will remember previous calls to avoid duplication if @use_cache
|
||||
Returns the id of the file or folder from its name as a string
|
||||
"""
|
||||
# cache logic
|
||||
if use_cache:
|
||||
self.api_cache = getattr(self, "api_cache", {})
|
||||
cache_key = f"{parent_id}_{name}_{use_mime_type}"
|
||||
if cache_key in self.api_cache:
|
||||
logger.debug(f"cache hit for {cache_key=}")
|
||||
return self.api_cache[cache_key]
|
||||
|
||||
# API logic
|
||||
debug_header: str = f"[searching {name=} in {parent_id=}]"
|
||||
query_string = f"'{parent_id}' in parents and name = '{name}' "
|
||||
if use_mime_type:
|
||||
@@ -115,10 +110,14 @@ class GDStorage(Storage):
|
||||
|
||||
if len(items) > 0:
|
||||
logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
|
||||
return items[-1]['id']
|
||||
_id = items[-1]['id']
|
||||
if use_cache: self.api_cache[cache_key] = _id
|
||||
return _id
|
||||
else:
|
||||
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}. sleeping for {sleep_seconds} second(s)')
|
||||
if attempt < retries - 1: time.sleep(sleep_seconds)
|
||||
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
|
||||
if attempt < retries - 1:
|
||||
logger.debug(f'sleeping for {sleep_seconds} second(s)')
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
if raise_on_missing:
|
||||
raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
|
||||
@@ -129,7 +128,7 @@ class GDStorage(Storage):
|
||||
Creates a new GDrive folder @name inside folder @parent_id
|
||||
Returns id of the created folder
|
||||
"""
|
||||
logger.debug(f'[_mkdir] Creating new folder with {name=} inside {parent_id=}')
|
||||
logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
|
||||
file_metadata = {
|
||||
'name': [name],
|
||||
'mimeType': 'application/vnd.google-apps.folder',
|
||||
|
||||
Reference in New Issue
Block a user