mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
refactoring filenumber into subfolder
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from loguru import logger
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Storage(ABC):
|
||||
@@ -19,3 +20,25 @@ class Storage(ABC):
|
||||
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
|
||||
with open(filename, 'rb') as f:
|
||||
self.uploadf(f, key, **kwargs)
|
||||
|
||||
def update_properties(self, **kwargs):
|
||||
"""
|
||||
method used to update general properties that some children may use
|
||||
and others not, but that all can call
|
||||
"""
|
||||
for k, v in kwargs.items():
|
||||
if k in self.get_allowed_properties():
|
||||
setattr(self, k, v)
|
||||
else:
|
||||
logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
|
||||
|
||||
def get_allowed_properties(self):
|
||||
"""
|
||||
child classes should specify which properties they allow to be set
|
||||
"""
|
||||
return set(["subfolder"])
|
||||
|
||||
def clean_path(self, folder, default="", add_forward_slash=True):
|
||||
if folder is None or type(folder) != str or len(folder.strip()) == 0:
|
||||
return default
|
||||
return str(Path(folder)) + ("/" if add_forward_slash else "")
|
||||
|
||||
@@ -15,6 +15,7 @@ class GDConfig:
|
||||
|
||||
|
||||
class GDStorage(Storage):
|
||||
DEFAULT_UPLOAD_FOLDER_NAME = "default"
|
||||
|
||||
def __init__(self, config: GDConfig):
|
||||
self.root_folder_id = config.root_folder_id
|
||||
@@ -22,19 +23,14 @@ class GDStorage(Storage):
|
||||
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
|
||||
self.service = build('drive', 'v3', credentials=creds)
|
||||
|
||||
def _get_path(self, key):
|
||||
return self.folder + key
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
# only support files saved in a folders for GD
|
||||
# S3 supports folder and all stored in the root
|
||||
|
||||
# key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
|
||||
foldername = key.split('/', 1)[0]
|
||||
# eg twitter__media_asdf.jpg
|
||||
filename = key.split('/', 1)[1]
|
||||
|
||||
logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
|
||||
"""
|
||||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
|
||||
filename = key
|
||||
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
|
||||
|
||||
# retry policy on Google Drive
|
||||
try_again = True
|
||||
@@ -42,11 +38,11 @@ class GDStorage(Storage):
|
||||
folder_id = None
|
||||
while try_again:
|
||||
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
|
||||
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
|
||||
and name = '{foldername}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
results = self.service.files().list(
|
||||
q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
|
||||
for item in items:
|
||||
@@ -55,11 +51,11 @@ class GDStorage(Storage):
|
||||
try_again = False
|
||||
|
||||
if folder_id is None:
|
||||
logger.debug(f'Cant find {foldername=} waiting and trying again {counter=}')
|
||||
logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
|
||||
counter += 1
|
||||
time.sleep(10)
|
||||
if counter > 18:
|
||||
raise ValueError(f'Cant find {foldername} and retried 18 times pausing 10seconds at a time which is 3 minutes')
|
||||
raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
|
||||
|
||||
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
|
||||
# happens doing thumbnails
|
||||
@@ -71,12 +67,11 @@ class GDStorage(Storage):
|
||||
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
|
||||
|
||||
# get id of the sub folder
|
||||
results = self.service.files().list(q=f"'{folder_id}' in parents \
|
||||
and mimeType='application/vnd.google-apps.folder' \
|
||||
and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
|
||||
filename = None
|
||||
@@ -87,11 +82,11 @@ class GDStorage(Storage):
|
||||
raise ValueError(f'Problem finding sub folder {a}')
|
||||
|
||||
# get id of file inside folder (or sub folder)
|
||||
results = self.service.files().list(q=f"'{folder_id}' in parents \
|
||||
and name = '{filename}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id}' in parents and name = '{filename}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
|
||||
file_id = None
|
||||
@@ -110,41 +105,36 @@ class GDStorage(Storage):
|
||||
return False
|
||||
|
||||
def uploadf(self, file, key, **_kwargs):
|
||||
# split on first occurance of /
|
||||
# eg SM0005
|
||||
foldername = key.split('/', 1)[0]
|
||||
# eg twitter__media_asdf.jpg
|
||||
filename = key.split('/', 1)[1]
|
||||
|
||||
logger.debug(f"before {self.subfolder=}")
|
||||
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
|
||||
filename = key
|
||||
logger.debug(f"after {self.subfolder=}")
|
||||
# does folder eg SM0005 exist already inside parent of Files auto-archiver
|
||||
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
|
||||
and mimeType='application/vnd.google-apps.folder' \
|
||||
and name = '{foldername}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
results = self.service.files().list(
|
||||
q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
folder_id_to_upload_to = None
|
||||
if len(items) > 1:
|
||||
logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
|
||||
logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
|
||||
|
||||
for item in items:
|
||||
logger.debug(f"Found existing folder of {item['name']}")
|
||||
folder_id_to_upload_to = item['id']
|
||||
|
||||
if folder_id_to_upload_to is None:
|
||||
logger.debug(f'Creating new folder {foldername}')
|
||||
logger.debug(f'Creating new folder {self.subfolder}')
|
||||
file_metadata = {
|
||||
'name': [foldername],
|
||||
'name': [self.subfolder],
|
||||
'mimeType': 'application/vnd.google-apps.folder',
|
||||
'parents': [self.root_folder_id]
|
||||
}
|
||||
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
|
||||
folder_id_to_upload_to = gd_file.get('id')
|
||||
|
||||
# check for subfolder nema in file eg youtube_dl_sDE-qZdi8p8/out1.jpg'
|
||||
# happens doing thumbnails
|
||||
|
||||
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
|
||||
# will always return a and a blank b even if there is nothing to split
|
||||
# https://stackoverflow.com/a/38149500/26086
|
||||
a, _, b = filename.partition('/')
|
||||
@@ -155,12 +145,11 @@ class GDStorage(Storage):
|
||||
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
|
||||
|
||||
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
|
||||
results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
|
||||
and mimeType='application/vnd.google-apps.folder' \
|
||||
and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
sub_folder_id_to_upload_to = None
|
||||
if len(items) > 1:
|
||||
@@ -184,17 +173,13 @@ class GDStorage(Storage):
|
||||
folder_id_to_upload_to = sub_folder_id_to_upload_to
|
||||
# back to normal control flow
|
||||
|
||||
# else:
|
||||
# upload file to gd
|
||||
file_metadata = {
|
||||
# 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
|
||||
'name': [filename],
|
||||
'parents': [folder_id_to_upload_to]
|
||||
}
|
||||
media = MediaFileUpload(file, resumable=True)
|
||||
gd_file = self.service.files().create(body=file_metadata,
|
||||
media_body=media,
|
||||
fields='id').execute()
|
||||
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
|
||||
|
||||
def upload(self, filename: str, key: str, **kwargs):
|
||||
# GD only requires the filename not a file reader
|
||||
|
||||
@@ -2,6 +2,7 @@ import boto3
|
||||
from botocore.errorfactory import ClientError
|
||||
from .base_storage import Storage
|
||||
from dataclasses import dataclass
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -19,12 +20,9 @@ class S3Storage(Storage):
|
||||
def __init__(self, config: S3Config):
|
||||
self.bucket = config.bucket
|
||||
self.region = config.region
|
||||
self.folder = config.folder
|
||||
self.folder = self.clean_path(config.folder)
|
||||
self.private = config.private
|
||||
|
||||
if len(self.folder) and self.folder[-1] != '/':
|
||||
self.folder += '/'
|
||||
|
||||
self.s3 = boto3.client(
|
||||
's3',
|
||||
region_name=self.region,
|
||||
@@ -34,7 +32,7 @@ class S3Storage(Storage):
|
||||
)
|
||||
|
||||
def _get_path(self, key):
|
||||
return self.folder + key
|
||||
return self.folder + self.clean_path(self.subfolder) + key
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
|
||||
@@ -47,9 +45,9 @@ class S3Storage(Storage):
|
||||
return False
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
logger.debug(f'[S3 storage] uploading {file=}, {key=}')
|
||||
if self.private:
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
else:
|
||||
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
|
||||
|
||||
Reference in New Issue
Block a user