From 16bd54b8d350126c81eb47e2c0d7f1ebc100fa1c Mon Sep 17 00:00:00 2001
From: Dave Mateer <davemateer@gmail.com>
Date: Tue, 12 Jul 2022 12:44:29 +0100
Subject: [PATCH 01/17] Put in fix for leading / in Google Drive

---
 storages/gd_storage.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index e772a90..2a92f51 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -28,6 +28,12 @@ class GDStorage(Storage):
         only support files saved in a folder for GD
         S3 supports folder and all stored in the root
         """
+        # doesn't work if key starts with / which can happen from telethon
+        if key.startswith('/'):
+            # remove first character ie /
+            logger.debug(f'CDN: Found and fixing leading / on uploading a file with {key=}')
+            key = key[1:]
+
         full_name = os.path.join(self.folder, key)
         parent_id, folder_id = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)
@@ -52,6 +58,13 @@ class GDStorage(Storage):
         1. for each sub-folder in the path check if exists or create
         2. upload file to root_id/other_paths.../filename
         """
+        # doesn't work if key starts with / which can happen from telethon
+        if key.startswith('/'):
+            # remove first character ie /
+            logger.debug(f'UPLOADF: Found and fixing a leading / on uploading a file with {key=}')
+            key = key[1:]
+
+        
         full_name = os.path.join(self.folder, key)
         parent_id, upload_to = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)

From 42172566f20cd3ba96a23a2ad2a6343565071a38 Mon Sep 17 00:00:00 2001
From: Dave Mateer <davemateer@gmail.com>
Date: Tue, 12 Jul 2022 12:53:59 +0100
Subject: [PATCH 02/17] Added whitelist and blacklist for worksheets (not
 spreadsheet)

---
 auto_archive.py     | 13 +++++++++++++
 configs/config.py   |  4 ++++
 example.config.yaml |  8 ++++++++
 3 files changed, 25 insertions(+)

diff --git a/auto_archive.py b/auto_archive.py
index 375c5be..840ccdc 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -58,6 +58,19 @@ def process_sheet(c: Config):
 
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
+
+        whitelist = c.worksheet_whitelist
+        if whitelist is not None:
+            if wks.title != whitelist: 
+                logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}')
+                continue
+
+        blacklist = c.worksheet_blacklist
+        if blacklist is not None:
+            if wks.title == blacklist: 
+                logger.debug(f'Ignoring worksheet {wks.title} as in blacklist')
+                continue
+
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
         gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
 
diff --git a/configs/config.py b/configs/config.py
index 4232651..98fabe9 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -50,6 +50,10 @@ class Config:
 
         self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
+
+        self.worksheet_whitelist = execution.get("worksheet_whitelist")
+        self.worksheet_blacklist = execution.get("worksheet_blacklist")
+
         self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
         self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
         self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
diff --git a/example.config.yaml b/example.config.yaml
index c5b6a76..8778bba 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -65,6 +65,14 @@ secrets:
 execution:
   # can be overwritten with CMD --sheet=
   sheet: your-sheet-name
+
+  # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive.
+  # worksheet_whitelist: Sheet1
+
+  # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet
+  # worksheet_blacklist: MASTERSHEET
+
+
   # which row of your tabs contains the header, can be overwritten with CMD --header=
   header: 1
   # which storage to use, can be overwritten with CMD --storage=

From 03e542a0fcc269c312c065aeabd4176641a4fc95 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 14 Jul 2022 17:45:28 +0200
Subject: [PATCH 03/17] isolate into function

---
 storages/gd_storage.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index 2a92f51..d9a11de 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -28,11 +28,7 @@ class GDStorage(Storage):
         only support files saved in a folder for GD
         S3 supports folder and all stored in the root
         """
-        # doesn't work if key starts with / which can happen from telethon
-        if key.startswith('/'):
-            # remove first character ie /
-            logger.debug(f'CDN: Found and fixing leading / on uploading a file with {key=}')
-            key = key[1:]
+        key = self.clean_key(key)
 
         full_name = os.path.join(self.folder, key)
         parent_id, folder_id = self.root_folder_id, None
@@ -58,13 +54,8 @@ class GDStorage(Storage):
         1. for each sub-folder in the path check if exists or create
         2. upload file to root_id/other_paths.../filename
         """
-        # doesn't work if key starts with / which can happen from telethon
-        if key.startswith('/'):
-            # remove first character ie /
-            logger.debug(f'UPLOADF: Found and fixing a leading / on uploading a file with {key=}')
-            key = key[1:]
+        key = self.clean_key(key)
 
-        
         full_name = os.path.join(self.folder, key)
         parent_id, upload_to = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)
@@ -90,6 +81,13 @@ class GDStorage(Storage):
         # GD only requires the filename not a file reader
         self.uploadf(filename, key, **kwargs)
 
+    def clean_key(self, key):
+        # GDrive does not work well with leading forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
     def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
         """
         Retrieves the id of a folder or file from its @name and the @parent_id folder

From 90cb080c811e2575274a0edf1b9f46abfd4aa1ae Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 14 Jul 2022 18:10:02 +0200
Subject: [PATCH 04/17] refactoring and renaming

---
 auto_archive.py     | 31 ++++++++++++++++---------------
 configs/config.py   |  9 +++++++--
 example.config.yaml | 13 +++++++------
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/auto_archive.py b/auto_archive.py
index 840ccdc..f12b9c4 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -53,23 +53,24 @@ def missing_required_columns(gw: GWorksheet):
     return missing
 
 
+def should_process_sheet(c, sheet_name):
+    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
+        # ALLOW rules exist AND sheet name not explicitly allowed
+        return False
+    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
+        # BLOCK rules exist AND sheet name is blocked
+        return False
+    return True
+
+
 def process_sheet(c: Config):
     sh = c.gsheets_client.open(c.sheet)
 
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
-
-        whitelist = c.worksheet_whitelist
-        if whitelist is not None:
-            if wks.title != whitelist: 
-                logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}')
-                continue
-
-        blacklist = c.worksheet_blacklist
-        if blacklist is not None:
-            if wks.title == blacklist: 
-                logger.debug(f'Ignoring worksheet {wks.title} as in blacklist')
-                continue
+        if not should_process_sheet(c, wks.title):
+            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
+            continue
 
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
         gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
@@ -93,7 +94,7 @@ def process_sheet(c: Config):
                 if not is_retry: continue
 
             # All checks done - archival process starts here
-            try: 
+            try:
                 gw.set_cell(row, 'status', 'Archive in progress')
                 url = expand_url(url)
                 c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -109,7 +110,7 @@ def process_sheet(c: Config):
                     YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                     TelegramArchiver(storage, c.webdriver),
                     TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage,  c.webdriver, c.vk_config),
+                    VkArchiver(storage, c.webdriver, c.vk_config),
                     WaybackArchiver(storage, c.webdriver, c.wayback_config)
                 ]
 
@@ -118,7 +119,7 @@ def process_sheet(c: Config):
 
                     try:
                         result = archiver.download(url, check_if_exists=c.check_if_exists)
-                    except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+                    except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
                     except Exception as e:
                         result = False
                         logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
diff --git a/configs/config.py b/configs/config.py
index 98fabe9..e9bd084 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -51,8 +51,11 @@ class Config:
         self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
 
-        self.worksheet_whitelist = execution.get("worksheet_whitelist")
-        self.worksheet_blacklist = execution.get("worksheet_blacklist")
+        def ensure_set(l):
+            l = l if isinstance(l, list) else [l]
+            return set([x for x in l if isinstance(x, str) and len(x) > 0])
+        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
+        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
 
         self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
         self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
@@ -250,6 +253,8 @@ class Config:
         return json.dumps({
             "config_file": self.config_file,
             "sheet": self.sheet,
+            "worksheet_allow": list(self.worksheet_allow),
+            "worksheet_block": list(self.worksheet_block),
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
diff --git a/example.config.yaml b/example.config.yaml
index 8778bba..3092efc 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -66,12 +66,13 @@ execution:
   # can be overwritten with CMD --sheet=
   sheet: your-sheet-name
 
-  # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive.
-  # worksheet_whitelist: Sheet1
-
-  # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet
-  # worksheet_blacklist: MASTERSHEET
-
+  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
+  # worksheet_allow and worksheet_block can be single values or lists
+  # if both are specified, a worksheet is processed only when it is in worksheet_allow and not in worksheet_block
+  # worksheet_allow:
+  #   - Sheet1
+  #   - "Sheet 2"
+  # worksheet_block: BlockedSheet
 
   # which row of your tabs contains the header, can be overwritten with CMD --header=
   header: 1

From 37e1fcd540e2549ea13a3d22d34a4244d4a640dd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 14 Jul 2022 18:10:53 +0200
Subject: [PATCH 05/17] comment

---
 configs/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configs/config.py b/configs/config.py
index e9bd084..41b531a 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -52,6 +52,7 @@ class Config:
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
 
         def ensure_set(l):
+            # always returns a set of strings, can receive a list or a string
             l = l if isinstance(l, list) else [l]
             return set([x for x in l if isinstance(x, str) and len(x) > 0])
         self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))

From 6d8be4c07f2d3ae6e6cdfda4d8bff4ad22420820 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 14 Jul 2022 18:16:06 +0200
Subject: [PATCH 06/17] s3 allow online preview instead of forced download

---
 storages/s3_storage.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index ceb75c1..b124aae 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -1,4 +1,4 @@
-import uuid, os
+import uuid, os, mimetypes
 from dataclasses import dataclass
 
 import boto3
@@ -21,6 +21,7 @@ class S3Config:
     private: bool = False
     key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid
 
+
 class S3Storage(Storage):
 
     def __init__(self, config: S3Config):
@@ -70,4 +71,5 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
+        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

From 363a8ef67a39a55d0386c53a097c6d3a8f1d15a1 Mon Sep 17 00:00:00 2001
From: Dave Mateer <davemateer@gmail.com>
Date: Mon, 18 Jul 2022 13:15:48 +0100
Subject: [PATCH 07/17] Added hash_algorithm to config to choose between SHA256
 and SHA3_512

---
 .gitignore                        |  4 +++-
 archivers/base_archiver.py        | 14 +++++++++-----
 archivers/telegram_archiver.py    |  3 +++
 archivers/telethon_archiver.py    |  4 ++--
 archivers/tiktok_archiver.py      |  3 +++
 archivers/twitter_api_archiver.py |  6 +++---
 archivers/twitter_archiver.py     |  4 ++++
 archivers/vk_archiver.py          |  4 ++--
 archivers/wayback_archiver.py     |  4 ++--
 archivers/youtubedl_archiver.py   |  4 ++--
 auto_archive.py                   | 16 ++++++++--------
 configs/config.py                 |  2 ++
 example.config.yaml               |  5 +++++
 13 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2885782..62a5815 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,6 @@ config.yaml
 config-*.yaml
 logs/*
 local_archive/
-vk_config*.json
\ No newline at end of file
+vk_config*.json
+
+secrets/*
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 815d31e..8951115 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -31,9 +31,10 @@ class Archiver(ABC):
     name = "default"
     retry_regex = r"retrying at (\d+)$"
 
-    def __init__(self, storage: Storage, driver):
+    def __init__(self, storage: Storage, driver, hash_algorithm):
         self.storage = storage
         self.driver = driver
+        self.hash_algorithm = hash_algorithm
 
     def __str__(self):
         return self.__class__.__name__
@@ -163,10 +164,13 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read()  # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            ha = self.hash_algorithm
+            logger.debug(f'Hash algorithm is {ha}')
+
+            if ha == "SHA3_512": hash = hashlib.sha3_512(bytes)
+            elif ha == "SHA256": hash = hashlib.sha256(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {ha}")
+
         return hash.hexdigest()
 
     def get_screenshot(self, url):
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 0b6e777..c38dd30 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -11,6 +11,9 @@ from storages import Storage
 class TelegramArchiver(Archiver):
     name = "telegram"
 
+    def __init__(self, storage: Storage, driver, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
+
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 't.me' != self.get_netloc(url):
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index f35e323..bce34d2 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -15,8 +15,8 @@ class TelethonArchiver(Archiver):
     name = "telethon"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
 
-    def __init__(self, storage: Storage, driver, config: TelethonConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
         if config:
             self.client = TelegramClient("./anon", config.api_id, config.api_hash)
             self.bot_token = config.bot_token
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 8100bb1..30b8c7c 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -15,6 +15,9 @@ class TiktokArchiver(Archiver):
 
         status = 'success'
 
+        def __init__(self, storage: Storage, driver, hash_algorithm):
+            super().__init__(storage, driver, hash_algorithm)
+
         try:
             info = tiktok_downloader.info_post(url)
             key = self.get_key(f'{info.id}.mp4')
diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py
index ef2bf40..99fb8f1 100644
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
     name = "twitter_api"
 
-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
 
         if config.bearer_token:
             self.api = Api(bearer_token=config.bearer_token)
@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):
 
             for u in urls:
                 if u is None:
-                    logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                    logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                     return self.download_alternative(url, tweet_id)
         logger.debug(f"found {urls=}")
 
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 1c1b173..750d2c4 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -5,12 +5,16 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 
 from .base_archiver import Archiver, ArchiveResult
 
+from storages import Storage
 
 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods, and it works as 
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """
+    def __init__(self, storage: Storage, driver, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
+
     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
 
diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py
index c448367..6ddba10 100644
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -17,8 +17,8 @@ class VkArchiver(Archiver):
     wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
     photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
 
-    def __init__(self, storage: Storage, driver, config: VkConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
         if config != None:
             self.vks = VkScraper(config.username, config.password)
 
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index f46d1cb..c19ca4f 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -15,8 +15,8 @@ class WaybackArchiver(Archiver):
     """
     name = "wayback"
 
-    def __init__(self, storage: Storage, driver, config: WaybackConfig):
-        super(WaybackArchiver, self).__init__(storage, driver)
+    def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm):
+        super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm)
         self.config = config
         self.seen_urls = {}
 
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 7990131..a41b6c6 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
 
-    def __init__(self, storage: Storage, driver, fb_cookie):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm):
+        super().__init__(storage, driver, hash_algorithm)
         self.fb_cookie = fb_cookie
 
     def download(self, url, check_if_exists=False):
diff --git a/auto_archive.py b/auto_archive.py
index f12b9c4..72cb748 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -104,14 +104,14 @@ def process_sheet(c: Config):
 
                 # order matters, first to succeed excludes remaining
                 active_archivers = [
-                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                    TiktokArchiver(storage, c.webdriver),
-                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
-                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
-                    TelegramArchiver(storage, c.webdriver),
-                    TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage, c.webdriver, c.vk_config),
-                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
+                    TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm),
+                    TiktokArchiver(storage, c.webdriver, c.hash_algorithm),
+                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm),
+                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm),
+                    TelegramArchiver(storage, c.webdriver, c.hash_algorithm),
+                    TwitterArchiver(storage, c.webdriver, c.hash_algorithm),
+                    VkArchiver(storage,  c.webdriver, c.vk_config, c.hash_algorithm),
+                    WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm)
                 ]
 
                 for archiver in active_archivers:
diff --git a/configs/config.py b/configs/config.py
index 41b531a..2d134da 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -81,6 +81,8 @@ class Config:
         )
         self.webdriver = "not initialized"
 
+        self.hash_algorithm = execution.get("hash_algorithm")
+
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
 
diff --git a/example.config.yaml b/example.config.yaml
index 3092efc..f823c47 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -104,3 +104,8 @@ execution:
     duration: duration
     screenshot: screenshot
     hash: hash
+
+  # Must be either SHA256 or SHA3_512
+  hash_algorithm: SHA3_512
+  # hash_algorithm: SHA256
+

From 9f9b9d8f634193bc7c202146da09cb8c2e6ac865 Mon Sep 17 00:00:00 2001
From: Dave Mateer <davemateer@gmail.com>
Date: Mon, 18 Jul 2022 13:25:05 +0100
Subject: [PATCH 08/17] adding in GD token

---
 storages/gd_storage.py | 52 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index d9a11de..e60e37f 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account
 
 
+from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request
+
 @dataclass
 class GDConfig:
     root_folder_id: str
+    oauth_token_file_path_and_name: str
+    service_account: str 
     folder: str = "default"
-    service_account: str = "service_account.json"
-
 
 class GDStorage(Storage):
     def __init__(self, config: GDConfig):
         self.folder = config.folder
         self.root_folder_id = config.root_folder_id
-        creds = service_account.Credentials.from_service_account_file(
-            config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
+        
+        SCOPES=['https://www.googleapis.com/auth/drive']
+        
+        token_file = config.oauth_token_file_path_and_name
+        if token_file is not None:
+            """
+            Tokens are refreshed after 1 hour
+            however keep working for 7 days (tbc)
+            so as long as the job doesn't last for 7 days
+            then this method of refreshing only once per run will work
+            see this link for details on the token
+            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+            """
+            logger.debug(f'Using GD OAuth token {token_file}')
+            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+            if not creds or not creds.valid:
+                if creds and creds.expired and creds.refresh_token:
+                    logger.debug('Requesting new GD OAuth token')
+                    creds.refresh(Request())
+                else:
+                    raise Exception("Problem with creds - create the token again")
+
+                # Save the credentials for the next run
+                with open(token_file, 'w') as token:
+                    logger.debug('Saving new GD OAuth token')
+                    token.write(creds.to_json())
+            else:
+                logger.debug('GD OAuth Token valid')
+        else:
+            gd_service_account = config.service_account
+            logger.debug(f'Using GD Service Account {gd_service_account}')
+            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+
         self.service = build('drive', 'v3', credentials=creds)
 
     def get_cdn_url(self, key):
@@ -88,13 +123,18 @@ class GDStorage(Storage):
             return key[1:]
         return key
 
-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
+    # gets the Drive folderID if it is there
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
         """
         Retrieves the id of a folder or file from its @name and the @parent_id folder
         Optionally does multiple @retries and sleeps @sleep_seconds between them
         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
         If @raise_on_missing will throw error when not found, or returns None
         Will remember previous calls to avoid duplication if @use_cache
+        DM - caching giving a perf improvement in order of 41s to 46s
+          So I prefer not to use yet, purely as caching notoriously hard in terms of edge cases
+          and pro's don't outweigh cons for me (yet)
+          to be fair I just need to test this and make sure it always runs well!
         Returns the id of the file or folder from its name as a string
         """
         # cache logic
@@ -107,7 +147,7 @@ class GDStorage(Storage):
 
         # API logic
         debug_header: str = f"[searching {name=} in {parent_id=}]"
-        query_string = f"'{parent_id}' in parents and name = '{name}' "
+        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
         if use_mime_type:
             query_string += f" and mimeType='application/vnd.google-apps.folder' "
 

From 524b40b8692c00d26a35ca256fbf91dab6369c40 Mon Sep 17 00:00:00 2001
From: Dave Mateer <davemateer@gmail.com>
Date: Mon, 18 Jul 2022 13:39:00 +0100
Subject: [PATCH 09/17] Added Google OAuth flow for Google Drive so can use a
 real user and not a service account to save files

---
 configs/config.py                 |  3 +-
 create_update_test_oauth_token.py | 77 +++++++++++++++++++++++++++++++
 example.config.yaml               | 12 ++++-
 3 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 create_update_test_oauth_token.py

diff --git a/configs/config.py b/configs/config.py
index 2d134da..2298c51 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -117,7 +117,8 @@ class Config:
             gd = secrets["google_drive"]
             self.gd_config = GDConfig(
                 root_folder_id=gd.get("root_folder_id"),
-                service_account=gd.get("service_account", GDConfig.service_account)
+                oauth_token_file_path_and_name=gd.get("oauth_token_file_path_and_name"),
+                service_account=gd.get("service_account")
             )
 
         if "local" in secrets:
diff --git a/create_update_test_oauth_token.py b/create_update_test_oauth_token.py
new file mode 100644
index 0000000..cfe2709
--- /dev/null
+++ b/create_update_test_oauth_token.py
@@ -0,0 +1,77 @@
+from __future__ import print_function
+
+import os.path
+
+from google.auth.transport.requests import Request
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+from googleapiclient.http import MediaFileUpload
+
+# If creating for first time download the json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs
+# https://davemateer.com/2022/04/28/google-drive-with-python for more information
+
+# Can run this code to get a new token and verify the token is the correct user
+# and it will refresh the token accordingly
+
+# Code below from https://developers.google.com/drive/api/quickstart/python
+
+SCOPES = ['https://www.googleapis.com/auth/drive']
+
+def main():
+    # token_file = 'gd-token.json'
+
+    token_file = 'secrets/token-davemateer-gmail.json'
+
+    creds = None
+
+    # The file token.json stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists(token_file):
+        creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            print('Requesting new token')
+            creds.refresh(Request())
+        else:
+            print('First run through so putting up login dialog')
+            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
+            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
+            creds = flow.run_local_server(port=0)
+        # Save the credentials for the next run
+        with open(token_file, 'w') as token:
+            print('Saving new token')
+            token.write(creds.to_json())
+    else:
+        print('Token valid')
+
+    try:
+        service = build('drive', 'v3', credentials=creds)
+
+        # About the user
+        results = service.about().get(fields="*").execute()
+        emailAddress = results['user']['emailAddress']
+        print(emailAddress)
+
+        # Call the Drive v3 API and return some files
+        results = service.files().list(
+            pageSize=10, fields="nextPageToken, files(id, name)").execute()
+        items = results.get('files', [])
+
+        if not items:
+            print('No files found.')
+            return
+        print('Files:')
+        for item in items:
+            print(u'{0} ({1})'.format(item['name'], item['id']))
+
+    except HttpError as error:
+        print(f'An error occurred: {error}')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/example.config.yaml b/example.config.yaml
index f823c47..60753fa 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -18,8 +18,16 @@ secrets:
 
   # needed if you use storage=gd
   google_drive:
-    # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
-    service_account: "service_account.json"
+    # 1.service account to write to google storage - be aware of 15GB limit. Recommend using OAuth user.
+    # filename can be the same or different file from google_sheets.service_account
+    # service_account: "service_account.json"
+
+    # 2.token (only 1. or 2. - if both specified then this 2. token takes precedence)
+    # will need to have write access on the server so refresh flow works
+    # run the file `create_update_test_oauth_token.py` to create the token and save in a secrets directory so 
+    # it is not checked into source control
+    oauth_token_file_path_and_name: "secrets/token-davemateer-gmail.json"
+
     root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
 
   # needed if you use storage=local

From 7b8be95e250dc3f0c42ca8840972b08048eacadf Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 12:12:14 +0100
Subject: [PATCH 10/17] removing empty line

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 62a5815..2059faa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,5 +17,4 @@ config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
-
 secrets/*
\ No newline at end of file

From 2d7d8c4e0803095a967ff78aeba42933a1a8f835 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 12:12:43 +0100
Subject: [PATCH 11/17] renaming and making default SHA-256

---
 archivers/base_archiver.py | 4 ++--
 configs/config.py          | 3 ++-
 example.config.yaml        | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 8951115..b377d31 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -167,8 +167,8 @@ class Archiver(ABC):
             ha = self.hash_algorithm
             logger.debug(f'Hash algorithm is {ha}')
 
-            if ha == "SHA3_512": hash = hashlib.sha3_512(bytes)
-            elif ha == "SHA256": hash = hashlib.sha256(bytes)
+            if ha == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            elif ha == "SHA-256": hash = hashlib.sha256(bytes)
             else: raise Exception("Unknown Hash Algorithm of {ha}")
 
         return hash.hexdigest()
diff --git a/configs/config.py b/configs/config.py
index 2d134da..4003282 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -81,7 +81,7 @@ class Config:
         )
         self.webdriver = "not initialized"
 
-        self.hash_algorithm = execution.get("hash_algorithm")
+        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
 
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -261,6 +261,7 @@ class Config:
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
+            "hash_algorithm": self.hash_algorithm,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
diff --git a/example.config.yaml b/example.config.yaml
index f823c47..2cded09 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -80,6 +80,10 @@ execution:
   storage: s3
   # defaults to false, when true will try to avoid duplicate URL archives
   check_if_exists: true
+
+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256
+
   # optional configurations for the selenium browser that takes screenshots, these are the defaults
   selenium:
     # values under 10s might mean screenshots fail to grab screenshot
@@ -105,7 +109,3 @@ execution:
     screenshot: screenshot
     hash: hash
 
-  # Must be either SHA256 or SHA3_512
-  hash_algorithm: SHA3_512
-  # hash_algorithm: SHA256
-

From 9317b5e03582a6a27d7eb3318fcbe8ea870fc091 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 12:27:50 +0100
Subject: [PATCH 12/17] turning HASH_ALGORITHM into global archiver prop

---
 archivers/base_archiver.py        | 15 ++++++---------
 archivers/telegram_archiver.py    |  4 ++--
 archivers/telethon_archiver.py    |  4 ++--
 archivers/tiktok_archiver.py      |  4 ++--
 archivers/twitter_api_archiver.py |  4 ++--
 archivers/twitter_archiver.py     |  4 ++--
 archivers/vk_archiver.py          |  4 ++--
 archivers/wayback_archiver.py     |  4 ++--
 archivers/youtubedl_archiver.py   |  4 ++--
 auto_archive.py                   | 16 ++++++++--------
 configs/config.py                 |  5 +++--
 11 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index b377d31..902f626 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -26,15 +26,14 @@ class ArchiveResult:
     screenshot: str = None
     hash: str = None
 
-
 class Archiver(ABC):
+    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"
 
-    def __init__(self, storage: Storage, driver, hash_algorithm):
+    def __init__(self, storage: Storage, driver):
         self.storage = storage
         self.driver = driver
-        self.hash_algorithm = hash_algorithm
 
     def __str__(self):
         return self.__class__.__name__
@@ -48,7 +47,6 @@ class Archiver(ABC):
     def get_netloc(self, url):
         return urlparse(url).netloc
 
-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
@@ -164,12 +162,11 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read()  # read entire file as bytes
-            ha = self.hash_algorithm
-            logger.debug(f'Hash algorithm is {ha}')
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
 
-            if ha == "SHA3-512": hash = hashlib.sha3_512(bytes)
-            elif ha == "SHA-256": hash = hashlib.sha256(bytes)
-            else: raise Exception("Unknown Hash Algorithm of {ha}")
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
 
         return hash.hexdigest()
 
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index c38dd30..4b2e59c 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -11,8 +11,8 @@ from storages import Storage
 class TelegramArchiver(Archiver):
     name = "telegram"
 
-    def __init__(self, storage: Storage, driver, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver):
+        super().__init__(storage, driver)
 
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index bce34d2..f35e323 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -15,8 +15,8 @@ class TelethonArchiver(Archiver):
     name = "telethon"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
 
-    def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver, config: TelethonConfig):
+        super().__init__(storage, driver)
         if config:
             self.client = TelegramClient("./anon", config.api_id, config.api_hash)
             self.bot_token = config.bot_token
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 30b8c7c..771a7f4 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -15,8 +15,8 @@ class TiktokArchiver(Archiver):
 
         status = 'success'
 
-        def __init__(self, storage: Storage, driver, hash_algorithm):
-            super().__init__(storage, driver, hash_algorithm)
+        def __init__(self, storage: Storage, driver):
+            super().__init__(storage, driver)
 
         try:
             info = tiktok_downloader.info_post(url)
diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py
index 99fb8f1..6aa1742 100644
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
     name = "twitter_api"
 
-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
+        super().__init__(storage, driver)
 
         if config.bearer_token:
             self.api = Api(bearer_token=config.bearer_token)
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 750d2c4..6fe5901 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -12,8 +12,8 @@ class TwitterArchiver(Archiver):
     This Twitter Archiver uses unofficial scraping methods, and it works as 
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """
-    def __init__(self, storage: Storage, driver, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver):
+        super().__init__(storage, driver)
 
     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py
index 6ddba10..c448367 100644
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -17,8 +17,8 @@ class VkArchiver(Archiver):
     wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
     photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
 
-    def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver, config: VkConfig):
+        super().__init__(storage, driver)
         if config != None:
             self.vks = VkScraper(config.username, config.password)
 
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index c19ca4f..f46d1cb 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -15,8 +15,8 @@ class WaybackArchiver(Archiver):
     """
     name = "wayback"
 
-    def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm):
-        super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver, config: WaybackConfig):
+        super(WaybackArchiver, self).__init__(storage, driver)
         self.config = config
         self.seen_urls = {}
 
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index a41b6c6..7990131 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
 
-    def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm):
-        super().__init__(storage, driver, hash_algorithm)
+    def __init__(self, storage: Storage, driver, fb_cookie):
+        super().__init__(storage, driver)
         self.fb_cookie = fb_cookie
 
     def download(self, url, check_if_exists=False):
diff --git a/auto_archive.py b/auto_archive.py
index 72cb748..c9a6b08 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -104,14 +104,14 @@ def process_sheet(c: Config):
 
                 # order matters, first to succeed excludes remaining
                 active_archivers = [
-                    TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm),
-                    TiktokArchiver(storage, c.webdriver, c.hash_algorithm),
-                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm),
-                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm),
-                    TelegramArchiver(storage, c.webdriver, c.hash_algorithm),
-                    TwitterArchiver(storage, c.webdriver, c.hash_algorithm),
-                    VkArchiver(storage,  c.webdriver, c.vk_config, c.hash_algorithm),
-                    WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm)
+                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
+                    TiktokArchiver(storage, c.webdriver),
+                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config,),
+                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
+                    TelegramArchiver(storage, c.webdriver),
+                    TwitterArchiver(storage, c.webdriver),
+                    VkArchiver(storage,  c.webdriver, c.vk_config),
+                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
                 ]
 
                 for archiver in active_archivers:
diff --git a/configs/config.py b/configs/config.py
index 4003282..063c4d7 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,5 +1,6 @@
 
 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -81,7 +82,7 @@ class Config:
         )
         self.webdriver = "not initialized"
 
-        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
 
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -261,7 +262,7 @@ class Config:
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
-            "hash_algorithm": self.hash_algorithm,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,

From e180b82b0d7e7f0168d5771fcbc8f8bf385b4cca Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 12:29:42 +0100
Subject: [PATCH 13/17] removing useless constructors

---
 archivers/telegram_archiver.py | 3 ---
 archivers/tiktok_archiver.py   | 3 ---
 archivers/twitter_archiver.py  | 4 ----
 3 files changed, 10 deletions(-)

diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 4b2e59c..0b6e777 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -11,9 +11,6 @@ from storages import Storage
 class TelegramArchiver(Archiver):
     name = "telegram"
 
-    def __init__(self, storage: Storage, driver):
-        super().__init__(storage, driver)
-
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 't.me' != self.get_netloc(url):
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 771a7f4..8100bb1 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -15,9 +15,6 @@ class TiktokArchiver(Archiver):
 
         status = 'success'
 
-        def __init__(self, storage: Storage, driver):
-            super().__init__(storage, driver)
-
         try:
             info = tiktok_downloader.info_post(url)
             key = self.get_key(f'{info.id}.mp4')
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 6fe5901..8f646fd 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -5,15 +5,11 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 
 from .base_archiver import Archiver, ArchiveResult
 
-from storages import Storage
-
 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods, and it works as 
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """
-    def __init__(self, storage: Storage, driver):
-        super().__init__(storage, driver)
 
     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

From 63140d69c145adb44c22081e7742f1bdc6ca633f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 12:35:27 +0100
Subject: [PATCH 14/17] format

---
 auto_archive.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_archive.py b/auto_archive.py
index c9a6b08..f12b9c4 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -106,11 +106,11 @@ def process_sheet(c: Config):
                 active_archivers = [
                     TelethonArchiver(storage, c.webdriver, c.telegram_config),
                     TiktokArchiver(storage, c.webdriver),
-                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config,),
+                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
                     YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                     TelegramArchiver(storage, c.webdriver),
                     TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage,  c.webdriver, c.vk_config),
+                    VkArchiver(storage, c.webdriver, c.vk_config),
                     WaybackArchiver(storage, c.webdriver, c.wayback_config)
                 ]
 

From 6124bc5f72b9d2c6c3af62169bae5140be8f3f15 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 14:52:50 +0100
Subject: [PATCH 15/17] refactored and simplified obtaining credentials

---
 .gitignore                        |  3 ++-
 configs/config.py                 |  4 ++--
 create_update_test_oauth_token.py | 22 +++++++++-------------
 example.config.yaml               | 17 ++++++++++-------
 storages/gd_storage.py            |  6 +++---
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.gitignore b/.gitignore
index 62a5815..8da75c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,5 +17,6 @@ config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
-
+gd-token.json
+credentials.json
 secrets/*
\ No newline at end of file
diff --git a/configs/config.py b/configs/config.py
index 2298c51..1169048 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -117,8 +117,8 @@ class Config:
             gd = secrets["google_drive"]
             self.gd_config = GDConfig(
                 root_folder_id=gd.get("root_folder_id"),
-                oauth_token_file_path_and_name=gd.get("oauth_token_file_path_and_name"),
-                service_account=gd.get("service_account")
+                oauth_token_filename=gd.get("oauth_token_filename"),
+                service_account=gd.get("service_account", GDConfig.service_account)
             )
 
         if "local" in secrets:
diff --git a/create_update_test_oauth_token.py b/create_update_test_oauth_token.py
index cfe2709..65b3086 100644
--- a/create_update_test_oauth_token.py
+++ b/create_update_test_oauth_token.py
@@ -1,5 +1,3 @@
-from __future__ import print_function
-
 import os.path
 
 from google.auth.transport.requests import Request
@@ -8,23 +6,20 @@ from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
 from googleapiclient.errors import HttpError
 
-from googleapiclient.http import MediaFileUpload
-
-# If creating for first time download the json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs
+# If creating the token for the first time, download the OAuth 2.0 Client ID json as `credentials.json` from https://console.cloud.google.com/apis/credentials
+# add "http://localhost:55192/" to the list of "Authorised redirect URIs"
 # https://davemateer.com/2022/04/28/google-drive-with-python for more information
 
-# Can run this code to get a new token and verify the token is the correct user
-# and it will refresh the token accordingly
+# You can run this code to get a new token and verify it belongs to the correct user
+# This token will be refreshed automatically by the auto-archiver
 
 # Code below from https://developers.google.com/drive/api/quickstart/python
 
 SCOPES = ['https://www.googleapis.com/auth/drive']
 
+
 def main():
-    # token_file = 'gd-token.json'
-
-    token_file = 'secrets/token-davemateer-gmail.json'
-
+    token_file = 'gd-token.json'
     creds = None
 
     # The file token.json stores the user's access and refresh tokens, and is
@@ -42,7 +37,7 @@ def main():
             print('First run through so putting up login dialog')
             # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
             flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
-            creds = flow.run_local_server(port=0)
+            creds = flow.run_local_server(port=55192)
         # Save the credentials for the next run
         with open(token_file, 'w') as token:
             print('Saving new token')
@@ -73,5 +68,6 @@ def main():
     except HttpError as error:
         print(f'An error occurred: {error}')
 
+
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/example.config.yaml b/example.config.yaml
index 60753fa..dc78803 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -18,15 +18,18 @@ secrets:
 
   # needed if you use storage=gd
   google_drive:
-    # 1.service account to write to google storage - be aware of 15GB limit. Recommend using OAuth user.
-    # filename can be the same or different file from google_sheets.service_account
+    # To authenticate with google you have two options (1. service account OR 2. OAuth token)
+
+    # 1. service account - storage space will count towards the developer account
+    # filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
     # service_account: "service_account.json"
 
-    # 2.token (only 1. or 2. - if both specified then this 2. token takes precedence)
-    # will need to have write access on the server so refresh flow works
-    # run the file `create_update_test_oauth_token.py` to create the token and save in a secrets directory so 
-    # it is not checked into source control
-    oauth_token_file_path_and_name: "secrets/token-davemateer-gmail.json"
+    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
+    # (use only 1. or 2. - if both are specified then option 2. takes precedence)
+    # needs write access on the server so refresh flow works
+    # To get the token, run the file `create_update_test_oauth_token.py`
+    # you can edit that file if you want a different token filename, default is "gd-token.json"
+    oauth_token_filename: "gd-token.json"
 
     root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
 
diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index e60e37f..be12625 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -14,8 +14,8 @@ from google.auth.transport.requests import Request
 @dataclass
 class GDConfig:
     root_folder_id: str
-    oauth_token_file_path_and_name: str
-    service_account: str 
+    oauth_token_filename: str
+    service_account: str = "service_account.json"
     folder: str = "default"
 
 class GDStorage(Storage):
@@ -25,7 +25,7 @@ class GDStorage(Storage):
         
         SCOPES=['https://www.googleapis.com/auth/drive']
         
-        token_file = config.oauth_token_file_path_and_name
+        token_file = config.oauth_token_filename
         if token_file is not None:
             """
             Tokens are refreshed after 1 hour

From 992dee022a366718ce22e58d9d66783daf12247e Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 25 Jul 2022 14:59:04 +0100
Subject: [PATCH 16/17] format

---
 storages/gd_storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index be12625..933c168 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -28,7 +28,7 @@ class GDStorage(Storage):
         token_file = config.oauth_token_filename
         if token_file is not None:
             """
-            Tokens are refreshed after 1 hour
+            Tokens are refreshed after 1 hour 
             however keep working for 7 days (tbc)
             so as long as the job doesn't last for 7 days
             then this method of refreshing only once per run will work

From c77b4a080a84c701a8a3c000776d269576d02027 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 21 Sep 2022 18:52:23 +0200
Subject: [PATCH 17/17] update comment

---
 storages/gd_storage.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index 933c168..5f3bbeb 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -130,11 +130,7 @@ class GDStorage(Storage):
         Optionally does multiple @retries and sleeps @sleep_seconds between them
         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
         If @raise_on_missing will throw error when not found, or returns None
-        Will remember previous calls to avoid duplication if @use_cache
-        DM - caching giving a perf improvement in order of 41s to 46s
-          So I prefer not to use yet, purely as caching notoriously hard in terms of edge cases
-          and pro's don't outweigh cons for me (yet)
-          to be fair I just need to test this and make sure it always runs well!
+        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
         Returns the id of the file or folder from its name as a string
         """
         # cache logic