From 51a3134065640e44bbae7f43aec698d99657e22c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Feb 2023 21:59:24 +0000 Subject: [PATCH] adds gd_drive storage --- Pipfile.lock | 134 ++++++------ .../create_update_gdrive_oauth_token.py | 45 ++-- src/auto_archiver/storages/__init__.py | 3 +- src/auto_archiver/storages/gd.py | 192 ++++++++++++++++++ src/auto_archiver/storages/gd_storage.py | 181 ----------------- src/auto_archiver/storages/local.py | 9 +- src/auto_archiver/storages/s3.py | 1 - 7 files changed, 298 insertions(+), 267 deletions(-) rename create_update_test_oauth_token.py => scripts/create_update_gdrive_oauth_token.py (59%) create mode 100644 src/auto_archiver/storages/gd.py delete mode 100644 src/auto_archiver/storages/gd_storage.py diff --git a/Pipfile.lock b/Pipfile.lock index 8c29e0b..ddf2832 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -49,27 +49,27 @@ }, "beautifulsoup4": { "hashes": [ - "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", - "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" + "sha256:0e79446b10b3ecb499c1556f7e228a53e64a2bfcebd455f370d8927cb5b59e39", + "sha256:bc4bdda6717de5a2987436fb8d72f45dc90dd856bdfd512a1314ce90349a0106" ], "index": "pypi", - "version": "==4.11.1" + "version": "==4.11.2" }, "boto3": { "hashes": [ - "sha256:4e876ba5d64928cde0c416dd844f04f22d6b73d14002bbc3ca55591f80f49927", - "sha256:c729bb0af76e85a2776b6bd3da8d9fa0f4b91b425eab51612aa53956f644ee23" + "sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d", + "sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734" ], "index": "pypi", - "version": "==1.26.54" + "version": "==1.26.66" }, "botocore": { "hashes": [ - "sha256:ca3ef7588daa664fe196d3234718db5f6b5dab961507500b4bb921e31133eea1", - "sha256:f2fe17ed6b8e163769a715f81cb6ce3d4628d172918de535256bdf34d29b704f" + 
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd", + "sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5" ], "markers": "python_version >= '3.7'", - "version": "==1.29.54" + "version": "==1.29.66" }, "brotli": { "hashes": [ @@ -168,11 +168,11 @@ }, "cachetools": { "hashes": [ - "sha256:5991bc0e08a1319bb618d3195ca5b6bc76646a49c21d55962977197b301cc1fe", - "sha256:8462eebf3a6c15d25430a8c27c56ac61340b2ecf60c9ce57afc2b97e450e47da" + "sha256:13dfddc7b8df938c21a940dfa6557ce6e94a2f1cdfa58eb90c805721d58f2c14", + "sha256:429e1a1e845c008ea6c85aa35d4b98b65d6a9763eeef3e37e92728a12d1de9d4" ], "markers": "python_version ~= '3.7'", - "version": "==5.2.1" + "version": "==5.3.0" }, "certifi": { "hashes": [ @@ -316,11 +316,11 @@ }, "dateparser": { "hashes": [ - "sha256:c47b6e4b8c4b2b2a21690111b6571b6991295ba327ec6503753abeebf5e80696", - "sha256:e703db1815270c020552f4b3e3a981937b48b2cbcfcef5347071b74788dd9214" + "sha256:fbed8b738a24c9cd7f47c4f2089527926566fe539e1a06125eddba75917b1eef", + "sha256:ff047d9cffad4d3113ead8ec0faf8a7fc43bab7d853ac8715e071312b53c465a" ], "index": "pypi", - "version": "==1.1.6" + "version": "==1.1.7" }, "exceptiongroup": { "hashes": [ @@ -371,11 +371,11 @@ }, "google-api-python-client": { "hashes": [ - "sha256:7e860e3ec27b504fb797fa23c07c012a874dd736491fddbe50a20d3bdde8ace6", - "sha256:bafce2a02b06ee501df039eba5874afc7d28c9cf5ef92253327776448706556d" + "sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483", + "sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0" ], "index": "pypi", - "version": "==2.73.0" + "version": "==2.77.0" }, "google-auth": { "hashes": [ @@ -395,11 +395,11 @@ }, "google-auth-oauthlib": { "hashes": [ - "sha256:40cc612a13c3336d5433e94e2adb42a0c88f6feb6c55769e44500fc70043a576", - "sha256:81056a310fb1c4a3e5a7e1a443e1eb96593c6bbc55b26c0261e4d3295d3e6593" + "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb", + 
"sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5" ], "index": "pypi", - "version": "==0.8.0" + "version": "==1.0.0" }, "googleapis-common-protos": { "hashes": [ @@ -660,10 +660,11 @@ }, "mypy-extensions": { "hashes": [ - "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", - "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", + "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782" ], - "version": "==0.4.3" + "markers": "python_version >= '3.5'", + "version": "==1.0.0" }, "oauth2client": { "hashes": [ @@ -768,35 +769,42 @@ }, "pycryptodomex": { "hashes": [ - "sha256:04610536921c1ec7adba158ef570348550c9f3a40bc24be9f8da2ef7ab387981", - "sha256:0ba28aa97cdd3ff5ed1a4f2b7f5cd04e721166bd75bd2b929e2734433882b583", - "sha256:0da835af786fdd1c9930994c78b23e88d816dc3f99aa977284a21bbc26d19735", - "sha256:1619087fb5b31510b0b0b058a54f001a5ffd91e6ffee220d9913064519c6a69d", - "sha256:1cda60207be8c1cf0b84b9138f9e3ca29335013d2b690774a5e94678ff29659a", - "sha256:22aed0868622d95179217c298e37ed7410025c7b29dac236d3230617d1e4ed56", - "sha256:231dc8008cbdd1ae0e34645d4523da2dbc7a88c325f0d4a59635a86ee25b41dd", - "sha256:2ad9bb86b355b6104796567dd44c215b3dc953ef2fae5e0bdfb8516731df92cf", - "sha256:4dbbe18cc232b5980c7633972ae5417d0df76fe89e7db246eefd17ef4d8e6d7a", - "sha256:6a465e4f856d2a4f2a311807030c89166529ccf7ccc65bef398de045d49144b6", - "sha256:70288d9bfe16b2fd0d20b6c365db614428f1bcde7b20d56e74cf88ade905d9eb", - "sha256:7993d26dae4d83b8f4ce605bb0aecb8bee330bb3c95475ef06f3694403621e71", - "sha256:8851585ff19871e5d69e1790f4ca5f6fd1699d6b8b14413b472a4c0dbc7ea780", - "sha256:893f8a97d533c66cc3a56e60dd3ed40a3494ddb4aafa7e026429a08772f8a849", - "sha256:8dd2d9e3c617d0712ed781a77efd84ea579e76c5f9b2a4bc0b684ebeddf868b2", - "sha256:a1c0ae7123448ecb034c75c713189cb00ebe2d415b11682865b6c54d200d9c93", - 
"sha256:b0789a8490114a2936ed77c87792cfe77582c829cb43a6d86ede0f9624ba8aa3", - "sha256:b3d04c00d777c36972b539fb79958790126847d84ec0129fce1efef250bfe3ce", - "sha256:ba57ac7861fd2c837cdb33daf822f2a052ff57dd769a2107807f52a36d0e8d38", - "sha256:ce338a9703f54b2305a408fc9890eb966b727ce72b69f225898bb4e9d9ed3f1f", - "sha256:daa67f5ebb6fbf1ee9c90decaa06ca7fc88a548864e5e484d52b0920a57fe8a5", - "sha256:e2453162f473c1eae4826eb10cd7bce19b5facac86d17fb5f29a570fde145abd", - "sha256:e25a2f5667d91795f9417cb856f6df724ccdb0cdd5cbadb212ee9bf43946e9f8", - "sha256:e5a670919076b71522c7d567a9043f66f14b202414a63c3a078b5831ae342c03", - "sha256:e9ba9d8ed638733c9e95664470b71d624a6def149e2db6cc52c1aca5a6a2df1d", - "sha256:f2b971a7b877348a27dcfd0e772a0343fb818df00b74078e91c008632284137d" + "sha256:0af93aad8d62e810247beedef0261c148790c52f3cd33643791cc6396dd217c1", + "sha256:12056c38e49d972f9c553a3d598425f8a1c1d35b2e4330f89d5ff1ffb70de041", + "sha256:23d83b610bd97704f0cd3acc48d99b76a15c8c1540d8665c94d514a49905bad7", + "sha256:2d4d395f109faba34067a08de36304e846c791808524614c731431ee048fe70a", + "sha256:32e764322e902bbfac49ca1446604d2839381bbbdd5a57920c9daaf2e0b778df", + "sha256:3c2516b42437ae6c7a29ef3ddc73c8d4714e7b6df995b76be4695bbe4b3b5cd2", + "sha256:40e8a11f578bd0851b02719c862d55d3ee18d906c8b68a9c09f8c564d6bb5b92", + "sha256:4b51e826f0a04d832eda0790bbd0665d9bfe73e5a4d8ea93b6a9b38beeebe935", + "sha256:4c4674f4b040321055c596aac926d12f7f6859dfe98cd12f4d9453b43ab6adc8", + "sha256:55eed98b4150a744920597c81b3965b632038781bab8a08a12ea1d004213c600", + "sha256:599bb4ae4bbd614ca05f49bd4e672b7a250b80b13ae1238f05fd0f09d87ed80a", + "sha256:5c23482860302d0d9883404eaaa54b0615eefa5274f70529703e2c43cc571827", + "sha256:64b876d57cb894b31056ad8dd6a6ae1099b117ae07a3d39707221133490e5715", + "sha256:67a3648025e4ddb72d43addab764336ba2e670c8377dba5dd752e42285440d31", + "sha256:6feedf4b0e36b395329b4186a805f60f900129cdf0170e120ecabbfcb763995d", + "sha256:78f0ddd4adc64baa39b416f3637aaf99f45acb0bcdc16706f0cc7ebfc6f10109", 
+ "sha256:7a6651a07f67c28b6e978d63aa3a3fccea0feefed9a8453af3f7421a758461b7", + "sha256:7a8dc3ee7a99aae202a4db52de5a08aa4d01831eb403c4d21da04ec2f79810db", + "sha256:7cc28dd33f1f3662d6da28ead4f9891035f63f49d30267d3b41194c8778997c8", + "sha256:7fa0b52df90343fafe319257b31d909be1d2e8852277fb0376ba89d26d2921db", + "sha256:88b0d5bb87eaf2a31e8a759302b89cf30c97f2f8ca7d83b8c9208abe8acb447a", + "sha256:a4fa037078e92c7cc49f6789a8bac3de06856740bb2038d05f2d9a2e4b165d59", + "sha256:a57e3257bacd719769110f1f70dd901c5b6955e9596ad403af11a3e6e7e3311c", + "sha256:ab33c2d9f275e05e235dbca1063753b5346af4a5cac34a51fa0da0d4edfb21d7", + "sha256:c84689c73358dfc23f9fdcff2cb9e7856e65e2ce3b5ed8ff630d4c9bdeb1867b", + "sha256:c92537b596bd5bffb82f8964cabb9fef1bca8a28a9e0a69ffd3ec92a4a7ad41b", + "sha256:caa937ff29d07a665dfcfd7a84f0d4207b2ebf483362fa9054041d67fdfacc20", + "sha256:d38ab9e53b1c09608ba2d9b8b888f1e75d6f66e2787e437adb1fecbffec6b112", + "sha256:d4cf0128da167562c49b0e034f09e9cedd733997354f2314837c2fa461c87bb1", + "sha256:db23d7341e21b273d2440ec6faf6c8b1ca95c8894da612e165be0b89a8688340", + "sha256:ee8bf4fdcad7d66beb744957db8717afc12d176e3fd9c5d106835133881a049b", + "sha256:f854c8476512cebe6a8681cc4789e4fcff6019c17baa0fd72b459155dc605ab4", + "sha256:fd29d35ac80755e5c0a99d96b44fb9abbd7e871849581ea6a4cb826d24267537" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==3.16.0" + "version": "==3.17" }, "pygments": { "hashes": [ @@ -840,11 +848,11 @@ }, "python-slugify": { "hashes": [ - "sha256:003aee64f9fd955d111549f96c4b58a3f40b9319383c70fad6277a4974bbf570", - "sha256:7a0f21a39fa6c1c4bf2e5984c9b9ae944483fd10b54804cb0e23a3ccd4954f0b" + "sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c", + "sha256:f1da83f3c7ab839b3f84543470cd95bdb5a81f1a0b80fed502f78b7dca256062" ], "index": "pypi", - "version": "==7.0.0" + "version": "==8.0.0" }, "python-twitter-v2": { "hashes": [ @@ -1035,11 +1043,11 @@ }, "rich": { "hashes": [ - 
"sha256:7c963f0d03819221e9ac561e1bc866e3f95a02248c1234daa48954e6d381c003", - "sha256:f1a00cdd3eebf999a15d85ec498bfe0b1a77efe9b34f645768a54132ef444ac5" + "sha256:125d96d20c92b946b983d0d392b84ff945461e5a06d3867e9f9e575f8697b67f", + "sha256:8aa57747f3fc3e977684f0176a88e789be314a99f99b43b75d1e9cb5dc6db9e9" ], "markers": "python_version >= '3.7'", - "version": "==13.2.0" + "version": "==13.3.1" }, "rsa": { "hashes": [ @@ -1059,11 +1067,11 @@ }, "selenium": { "hashes": [ - "sha256:06a1c7d9f313130b21c3218ddd8852070d0e7419afdd31f96160cd576555a5ce", - "sha256:3aefa14a28a42e520550c1cd0f29cf1d566328186ea63aa9a3e01fb265b5894d" + "sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683", + "sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af" ], "index": "pypi", - "version": "==4.7.2" + "version": "==4.8.0" }, "six": { "hashes": [ @@ -1106,11 +1114,11 @@ }, "telethon": { "hashes": [ - "sha256:3ec7ea04e61e0179dd08b974b609814e1a5298eeda3d68368a34bba754f43aec", - "sha256:d894f6ef2bf2cb119f6413b9f620957503785bab0999694b4bf67dea36f8ee09" + "sha256:21fb26051adc521a4a00a157e6f4a9e87711940ac3504414f96e66056918ef61", + "sha256:39ae3c3335ddd5acc80e395969f27556df140a73e58e9d3bb45863c766c23a8c" ], "index": "pypi", - "version": "==1.26.1" + "version": "==1.27.0" }, "text-unidecode": { "hashes": [ diff --git a/create_update_test_oauth_token.py b/scripts/create_update_gdrive_oauth_token.py similarity index 59% rename from create_update_test_oauth_token.py rename to scripts/create_update_gdrive_oauth_token.py index 65b3086..ac2f446 100644 --- a/create_update_test_oauth_token.py +++ b/scripts/create_update_gdrive_oauth_token.py @@ -1,4 +1,5 @@ import os.path +import click, json from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials @@ -6,27 +7,41 @@ from google_auth_oauthlib.flow import InstalledAppFlow from googleapiclient.discovery import build from googleapiclient.errors import HttpError -# If creating for 
first time download the OAuth Client Ids json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs -# add "http://localhost:55192/" to the list of "Authorised redirect URIs" -# https://davemateer.com/2022/04/28/google-drive-with-python for more information - # You can run this code to get a new token and verify it belongs to the correct user # This token will be refresh automatically by the auto-archiver - # Code below from https://developers.google.com/drive/api/quickstart/python SCOPES = ['https://www.googleapis.com/auth/drive'] -def main(): - token_file = 'gd-token.json' - creds = None - +@click.command( + help="script to generate Google Drive OAuth token to use gdrive_storage, requires credentials.json and outputs gd-token.json, if you don't have credentials.json go to https://console.cloud.google.com/apis/credentials. Be sure to add 'http://localhost:55192/' to the Authorized redirect URIs in your OAuth App. More info: https://davemateer.com/2022/04/28/google-drive-with-python" +) +@click.option( + "--credentials", + "-c", + type=click.Path(exists=True), + help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials", + required=True +) +@click.option( + "--token", + "-t", + type=click.Path(exists=False), + default="gd-token.json", + help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json", + required=True +) +def main(credentials, token): # The file token.json stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. - if os.path.exists(token_file): - creds = Credentials.from_authorized_user_file(token_file, SCOPES) + # created automatically when the authorization flow completes for the first time. 
+ creds = None + if os.path.exists(token): + with open(token, 'r') as stream: + creds_json = json.load(stream) + # creds = Credentials.from_authorized_user_file(creds_json, SCOPES) + creds_json['refresh_token'] = creds_json.get("refresh_token", "") + creds = Credentials.from_authorized_user_info(creds_json, SCOPES) # If there are no (valid) credentials available, let the user log in. if not creds or not creds.valid: @@ -36,10 +51,10 @@ def main(): else: print('First run through so putting up login dialog') # credentials.json downloaded from https://console.cloud.google.com/apis/credentials - flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES) + flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES) creds = flow.run_local_server(port=55192) # Save the credentials for the next run - with open(token_file, 'w') as token: + with open(token, 'w') as token: print('Saving new token') token.write(creds.to_json()) else: diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py index 250004a..c375f8e 100644 --- a/src/auto_archiver/storages/__init__.py +++ b/src/auto_archiver/storages/__init__.py @@ -1,3 +1,4 @@ from .storage import Storage from .s3 import S3Storage -from .local import LocalStorage \ No newline at end of file +from .local import LocalStorage +from .gd import GDriveStorage \ No newline at end of file diff --git a/src/auto_archiver/storages/gd.py b/src/auto_archiver/storages/gd.py new file mode 100644 index 0000000..7dbbb21 --- /dev/null +++ b/src/auto_archiver/storages/gd.py @@ -0,0 +1,192 @@ + +import shutil, os, time, json +from typing import IO +from loguru import logger + +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload +from google.oauth2 import service_account +from google.oauth2.credentials import Credentials +from google.auth.transport.requests import Request + +from ..core import Media +from . 
class GDriveStorage(Storage):
    """Storage backend that uploads media files to a Google Drive folder.

    Credentials come from a user OAuth token file when ``oauth_token`` is
    configured, otherwise from a service account JSON file
    (``service_account``). Files are stored below ``root_folder_id``,
    re-creating the folder hierarchy encoded in each media key.
    """
    name = "gdrive_storage"

    def __init__(self, config: dict) -> None:
        """Load Google credentials and build the Drive v3 API client.

        Raises:
            Exception: if the OAuth token is invalid and cannot be refreshed.
        """
        super().__init__(config)

        SCOPES = ['https://www.googleapis.com/auth/drive']

        if self.oauth_token is not None:
            # Tokens are refreshed after 1 hour
            # however keep working for 7 days (tbc)
            # so as long as the job doesn't last for 7 days
            # then this method of refreshing only once per run will work
            # see this link for details on the token
            # https://davemateer.com/2022/04/28/google-drive-with-python#tokens
            logger.debug(f'Using GD OAuth token {self.oauth_token}')
            # workaround for missing 'refresh_token' in from_authorized_user_file
            with open(self.oauth_token, 'r') as stream:
                creds_json = json.load(stream)
            creds_json['refresh_token'] = creds_json.get("refresh_token", "")
            creds = Credentials.from_authorized_user_info(creds_json, SCOPES)

            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    logger.debug('Requesting new GD OAuth token')
                    creds.refresh(Request())
                else:
                    raise Exception("Problem with creds - create the token again")

                # Save the credentials for the next run
                with open(self.oauth_token, 'w') as token:
                    logger.debug('Saving new GD OAuth token')
                    token.write(creds.to_json())
            else:
                logger.debug('GD OAuth Token valid')
        else:
            # `config` is a plain dict, so `config.service_account` would raise
            # AttributeError; read the setting from the instance like the other
            # options (oauth_token, root_folder_id) used by this class.
            gd_service_account = self.service_account
            logger.debug(f'Using GD Service Account {gd_service_account}')
            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)

        self.service = build('drive', 'v3', credentials=creds)

    @staticmethod
    def configs() -> dict:
        """Return the base Storage options extended with the Google Drive ones."""
        return dict(
            Storage.configs(),
            ** {
                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account."},
                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
            })

    @staticmethod
    def _escape_query_value(value: str) -> str:
        """Escape backslashes and single quotes for a quoted Drive `q` literal."""
        return value.replace("\\", "\\\\").replace("'", "\\'")

    def get_cdn_url(self, media: Media) -> str:
        """Return the shareable Drive URL of the file stored under ``media.key``.

        Every folder named in the key is resolved in turn, starting from
        ``root_folder_id``; the file is then looked up by name in the innermost
        folder (the root folder itself when the key has no folder part).

        Raises:
            ValueError: if a folder on the path, or the file, cannot be found.
        """
        parent_id = self.root_folder_id
        path_parts = media.key.split(os.path.sep)
        filename = path_parts[-1]
        logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
        for folder in path_parts[0:-1]:
            parent_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)

        # get id of file inside folder (or sub folder)
        file_id = self._get_id_from_parent_and_name(parent_id, filename)
        return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"

    def upload(self, media: Media, **kwargs) -> bool:
        """Upload the local file ``media.filename`` to Drive under ``media.key``.

        1. for each sub-folder in the path check if exists or create
        2. upload file to root_id/other_paths.../filename

        Returns:
            True once the Drive API has accepted the upload; API errors propagate.
        """
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
        parent_id = self.root_folder_id
        path_parts = media.key.split(os.path.sep)
        filename = path_parts[-1]
        logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
        for folder in path_parts[0:-1]:
            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
            if folder_id is None:
                folder_id = self._mkdir(folder, parent_id)
            parent_id = folder_id

        # upload file to gd; a key without folders lands directly in the root folder
        logger.debug(f'uploading {filename=} to folder id {parent_id}')
        file_metadata = {
            'name': filename,  # Drive expects a string here, not a list
            'parents': [parent_id]
        }
        # separate local name so the `media` parameter is not shadowed
        file_upload = MediaFileUpload(media.filename, resumable=True)
        gd_file = self.service.files().create(body=file_metadata, media_body=file_upload, fields='id').execute()
        logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={parent_id}')
        return True

    # must be implemented even if unused
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """Unused stream-upload hook; `upload` sends the file by path instead."""

    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
        """
        Retrieves the id of a folder or file from its @name and the @parent_id folder
        Optionally does multiple @retries and sleeps @sleep_seconds between them
        If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
        If @raise_on_missing will throw error when not found, or returns None
        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
        Returns the id of the file or folder from its name as a string
        """
        # cache logic
        if use_cache:
            self.api_cache = getattr(self, "api_cache", {})
            cache_key = f"{parent_id}_{name}_{use_mime_type}"
            if cache_key in self.api_cache:
                logger.debug(f"cache hit for {cache_key=}")
                return self.api_cache[cache_key]

        # API logic
        debug_header: str = f"[searching {name=} in {parent_id=}]"
        # names may contain quotes/backslashes, which must be escaped inside the query literal
        escaped_name = self._escape_query_value(name)
        query_string = f"'{parent_id}' in parents and name = '{escaped_name}' and trashed = false "
        if use_mime_type:
            query_string += " and mimeType='application/vnd.google-apps.folder' "

        for attempt in range(retries):
            results = self.service.files().list(
                q=query_string,
                spaces='drive',  # ie not appDataFolder or photos
                fields='files(id, name)'
            ).execute()
            items = results.get('files', [])

            if len(items) > 0:
                logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
                _id = items[-1]['id']
                if use_cache: self.api_cache[cache_key] = _id
                return _id
            else:
                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
                if attempt < retries - 1:
                    logger.debug(f'sleeping for {sleep_seconds} second(s)')
                    time.sleep(sleep_seconds)

        if raise_on_missing:
            raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
        return None

    def _mkdir(self, name: str, parent_id: str):
        """
        Creates a new GDrive folder @name inside folder @parent_id
        Returns id of the created folder
        """
        logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
        file_metadata = {
            'name': name,  # Drive expects a string here, not a list
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [parent_id]
        }
        gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
        return gd_folder.get('id')
import dataclass -# from googleapiclient.discovery import build -# from googleapiclient.http import MediaFileUpload -# from google.oauth2 import service_account - - -# from google.oauth2.credentials import Credentials -# from google.auth.transport.requests import Request - -# @dataclass -# class GDConfig: -# root_folder_id: str -# oauth_token_filename: str -# service_account: str = "service_account.json" -# folder: str = "default" - -# class GDStorage(Storage): -# def __init__(self, config: GDConfig): -# self.folder = config.folder -# self.root_folder_id = config.root_folder_id - -# SCOPES=['https://www.googleapis.com/auth/drive'] - -# token_file = config.oauth_token_filename -# if token_file is not None: -# """ -# Tokens are refreshed after 1 hour -# however keep working for 7 days (tbc) -# so as long as the job doesn't last for 7 days -# then this method of refreshing only once per run will work -# see this link for details on the token -# https://davemateer.com/2022/04/28/google-drive-with-python#tokens -# """ -# logger.debug(f'Using GD OAuth token {token_file}') -# creds = Credentials.from_authorized_user_file(token_file, SCOPES) - -# if not creds or not creds.valid: -# if creds and creds.expired and creds.refresh_token: -# logger.debug('Requesting new GD OAuth token') -# creds.refresh(Request()) -# else: -# raise Exception("Problem with creds - create the token again") - -# # Save the credentials for the next run -# with open(token_file, 'w') as token: -# logger.debug('Saving new GD OAuth token') -# token.write(creds.to_json()) -# else: -# logger.debug('GD OAuth Token valid') -# else: -# gd_service_account = config.service_account -# logger.debug(f'Using GD Service Account {gd_service_account}') -# creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) - -# self.service = build('drive', 'v3', credentials=creds) - -# def get_cdn_url(self, key): -# """ -# only support files saved in a folder for GD -# S3 supports folder 
and all stored in the root -# """ -# key = self.clean_key(key) - -# full_name = os.path.join(self.folder, key) -# parent_id, folder_id = self.root_folder_id, None -# path_parts = full_name.split(os.path.sep) -# filename = path_parts[-1] -# logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}") -# for folder in path_parts[0:-1]: -# folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) -# parent_id = folder_id - -# # get id of file inside folder (or sub folder) -# file_id = self._get_id_from_parent_and_name(folder_id, filename) -# return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" - -# def exists(self, key): -# try: -# self.get_cdn_url(key) -# return True -# except: return False - -# def uploadf(self, file: str, key: str, **_kwargs): -# """ -# 1. for each sub-folder in the path check if exists or create -# 2. upload file to root_id/other_paths.../filename -# """ -# key = self.clean_key(key) - -# full_name = os.path.join(self.folder, key) -# parent_id, upload_to = self.root_folder_id, None -# path_parts = full_name.split(os.path.sep) -# filename = path_parts[-1] -# logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") -# for folder in path_parts[0:-1]: -# upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) -# if upload_to is None: -# upload_to = self._mkdir(folder, parent_id) -# parent_id = upload_to - -# # upload file to gd -# logger.debug(f'uploading {filename=} to folder id {upload_to}') -# file_metadata = { -# 'name': [filename], -# 'parents': [upload_to] -# } -# media = MediaFileUpload(file, resumable=True) -# gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() -# logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}') - -# def upload(self, filename: str, key: str, **kwargs): -# 
# GD only requires the filename not a file reader -# self.uploadf(filename, key, **kwargs) - -# # gets the Drive folderID if it is there -# def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): -# """ -# Retrieves the id of a folder or file from its @name and the @parent_id folder -# Optionally does multiple @retries and sleeps @sleep_seconds between them -# If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" -# If @raise_on_missing will throw error when not found, or returns None -# Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk -# Returns the id of the file or folder from its name as a string -# """ -# # cache logic -# if use_cache: -# self.api_cache = getattr(self, "api_cache", {}) -# cache_key = f"{parent_id}_{name}_{use_mime_type}" -# if cache_key in self.api_cache: -# logger.debug(f"cache hit for {cache_key=}") -# return self.api_cache[cache_key] - -# # API logic -# debug_header: str = f"[searching {name=} in {parent_id=}]" -# query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false " -# if use_mime_type: -# query_string += f" and mimeType='application/vnd.google-apps.folder' " - -# for attempt in range(retries): -# results = self.service.files().list( -# q=query_string, -# spaces='drive', # ie not appDataFolder or photos -# fields='files(id, name)' -# ).execute() -# items = results.get('files', []) - -# if len(items) > 0: -# logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}") -# _id = items[-1]['id'] -# if use_cache: self.api_cache[cache_key] = _id -# return _id -# else: -# logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.') -# if attempt < retries - 1: -# logger.debug(f'sleeping for {sleep_seconds} 
second(s)') -# time.sleep(sleep_seconds) - -# if raise_on_missing: -# raise ValueError(f'{debug_header} not found after {retries} attempt(s)') -# return None - -# def _mkdir(self, name: str, parent_id: str): -# """ -# Creates a new GDrive folder @name inside folder @parent_id -# Returns id of the created folder -# """ -# logger.debug(f'Creating new folder with {name=} inside {parent_id=}') -# file_metadata = { -# 'name': [name], -# 'mimeType': 'application/vnd.google-apps.folder', -# 'parents': [parent_id] -# } -# gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() -# return gd_folder.get('id') diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py index f4fb6bc..aa08e49 100644 --- a/src/auto_archiver/storages/local.py +++ b/src/auto_archiver/storages/local.py @@ -1,12 +1,9 @@ import shutil -from typing import IO, Any -import boto3, uuid, os, mimetypes -from botocore.errorfactory import ClientError +from typing import IO +import os from loguru import logger -from slugify import slugify -from ..core import Metadata from ..core import Media from ..storages import Storage @@ -28,7 +25,7 @@ class LocalStorage(Storage): }) def get_cdn_url(self, media: Media) -> str: - #TODO: is this viable with Storage.configs on path/filename? + # TODO: is this viable with Storage.configs on path/filename? 
dest = os.path.join(self.save_to, media.key) if self.save_absolute: dest = os.path.abspath(dest) diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py index f46ba43..37370f4 100644 --- a/src/auto_archiver/storages/s3.py +++ b/src/auto_archiver/storages/s3.py @@ -41,7 +41,6 @@ class S3Storage(Storage): "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" }, "private": {"default": False, "help": "if true S3 files will not be readable online"}, - # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"}, }) def get_cdn_url(self, media: Media) -> str: