From 51d448f0cbabbf2ba6a5c4408ae9b8c09d356019 Mon Sep 17 00:00:00 2001
From: Logan Williams <logan.williams@alum.mit.edu>
Date: Sun, 20 Feb 2022 10:27:25 +0100
Subject: [PATCH 01/16] Refactor archivers to make it easier to add support for
 new types of URLs

---
 .gitignore           |   1 +
 Pipfile              |   8 +-
 Pipfile.lock         | 421 ++++++++++++++++++++++++++++++++++++-------
 archivers.py         | 390 +++++++++++++++++++++++++++++++++++++++
 auto_archive.py      | 391 +++++-----------------------------------
 auto_auto_archive.py |   8 +-
 requirements.txt     |   5 -
 7 files changed, 807 insertions(+), 417 deletions(-)
 create mode 100644 archivers.py
 delete mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 4f3d132..b6a6b68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ tmp/
 expmt/
 service_account.json
 __pycache__/
+._*
diff --git a/Pipfile b/Pipfile
index 8c71f78..88dbebf 100644
--- a/Pipfile
+++ b/Pipfile
@@ -11,8 +11,14 @@ youtube_dl = "*"
 argparse = "*"
 ffmpeg-python = "*"
 beautifulsoup4 = "*"
+nordvpn-switcher = "*"
+tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
+telethon = "*"
+ffmpeg = "*"
+bs4 = "*"
+loguru = "*"
 
 [dev-packages]
 
 [requires]
-python_version = "3.8"
+python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
index ef838d8..8a5f227 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,11 +1,11 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "2aa6e5f9d7cda1a459444bf812fb2f7a4acfe547e7c65a975ab41530f9213da5"
+            "sha256": "420a5d5c155830dac792fe2f037bebce97c30f4271301bb5950a288254798660"
         },
         "pipfile-spec": 6,
         "requires": {
-            "python_version": "3.8"
+            "python_version": "3.9"
         },
         "sources": [
             {
@@ -26,49 +26,79 @@
         },
         "beautifulsoup4": {
             "hashes": [
-                "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
-                "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
-                "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
+                "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
+                "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"
             ],
             "index": "pypi",
-            "version": "==4.9.3"
+            "version": "==4.10.0"
         },
         "boto3": {
             "hashes": [
-                "sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11",
-                "sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e"
+                "sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989",
+                "sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557"
             ],
             "index": "pypi",
-            "version": "==1.18.27"
+            "version": "==1.21.3"
         },
         "botocore": {
             "hashes": [
-                "sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5",
-                "sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5"
+                "sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7",
+                "sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046"
             ],
-            "version": "==1.21.27"
+            "markers": "python_version >= '3.6'",
+            "version": "==1.24.3"
+        },
+        "bs4": {
+            "hashes": [
+                "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+            ],
+            "index": "pypi",
+            "version": "==0.0.1"
         },
         "cachetools": {
             "hashes": [
-                "sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001",
-                "sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff"
+                "sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6",
+                "sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4"
             ],
-            "version": "==4.2.2"
+            "markers": "python_version ~= '3.7'",
+            "version": "==5.0.0"
         },
         "certifi": {
             "hashes": [
-                "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
-                "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
+                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
+                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
             ],
-            "version": "==2021.5.30"
+            "version": "==2021.10.8"
         },
         "charset-normalizer": {
             "hashes": [
-                "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
-                "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
+                "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
+                "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
             ],
             "markers": "python_version >= '3'",
-            "version": "==2.0.4"
+            "version": "==2.0.12"
+        },
+        "click": {
+            "hashes": [
+                "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
+                "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==8.0.4"
+        },
+        "cloudscraper": {
+            "hashes": [
+                "sha256:674fd739f9412188aae8d6614e3e6316939fc0670ef5646abd3d316f1a59d3c2",
+                "sha256:dda29028c5628b5ba3e4dc43816ed38fd46bd945ef938c420f185586a6d8dff2"
+            ],
+            "version": "==1.2.58"
+        },
+        "ffmpeg": {
+            "hashes": [
+                "sha256:6931692c890ff21d39938433c2189747815dca0c60ddc7f9bb97f199dba0b5b9"
+            ],
+            "index": "pypi",
+            "version": "==1.4"
         },
         "ffmpeg-python": {
             "hashes": [
@@ -78,55 +108,272 @@
             "index": "pypi",
             "version": "==0.2.0"
         },
+        "flask": {
+            "hashes": [
+                "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f",
+                "sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.0.3"
+        },
         "future": {
             "hashes": [
                 "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
             ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==0.18.2"
         },
         "google-auth": {
             "hashes": [
-                "sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c",
-                "sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3"
+                "sha256:218ca03d7744ca0c8b6697b6083334be7df49b7bf76a69d555962fd1a7657b5f",
+                "sha256:ad160fc1ea8f19e331a16a14a79f3d643d813a69534ba9611d2c80dc10439dad"
             ],
-            "version": "==2.0.1"
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==2.6.0"
         },
         "google-auth-oauthlib": {
             "hashes": [
-                "sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba",
-                "sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67"
+                "sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73",
+                "sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a"
             ],
-            "version": "==0.4.5"
+            "markers": "python_version >= '3.6'",
+            "version": "==0.4.6"
         },
         "gspread": {
             "hashes": [
-                "sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be",
-                "sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48"
+                "sha256:d9db8c43d552f541ea072d4727d1e955bc2368b095dd86c5429a845c9d8aed8f",
+                "sha256:ffba57786e27519fb97125e3de37a0f062134a396506681f5baacaf47a9febe3"
             ],
             "index": "pypi",
-            "version": "==4.0.1"
+            "version": "==5.1.1"
         },
         "idna": {
             "hashes": [
-                "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
-                "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
+                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
+                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
             ],
             "markers": "python_version >= '3'",
-            "version": "==3.2"
+            "version": "==3.3"
+        },
+        "itsdangerous": {
+            "hashes": [
+                "sha256:29285842166554469a56d427addc0843914172343784cb909695fdbe90a3e129",
+                "sha256:d848fcb8bc7d507c4546b448574e8a44fc4ea2ba84ebf8d783290d53e81992f5"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.1.0"
+        },
+        "jinja2": {
+            "hashes": [
+                "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
+                "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.0.3"
         },
         "jmespath": {
             "hashes": [
                 "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
                 "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
             ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==0.10.0"
         },
+        "loguru": {
+            "hashes": [
+                "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
+                "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
+            ],
+            "index": "pypi",
+            "version": "==0.6.0"
+        },
+        "lxml": {
+            "hashes": [
+                "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
+                "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
+                "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
+                "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
+                "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
+                "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
+                "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
+                "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
+                "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
+                "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
+                "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
+                "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
+                "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
+                "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
+                "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
+                "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
+                "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
+                "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
+                "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
+                "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
+                "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
+                "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
+                "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
+                "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
+                "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
+                "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
+                "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
+                "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
+                "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
+                "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
+                "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
+                "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
+                "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
+                "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
+                "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
+                "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
+                "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
+                "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
+                "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
+                "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
+                "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
+                "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
+                "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
+                "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
+                "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
+                "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
+                "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
+                "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
+                "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
+                "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
+                "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
+                "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
+                "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
+                "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
+                "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
+                "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
+                "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
+                "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
+                "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
+                "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
+                "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==4.8.0"
+        },
+        "markupsafe": {
+            "hashes": [
+                "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
+                "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
+                "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
+                "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
+                "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
+                "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
+                "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
+                "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
+                "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
+                "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
+                "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
+                "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
+                "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
+                "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
+                "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
+                "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
+                "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
+                "sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
+                "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
+                "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
+                "sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
+                "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
+                "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
+                "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
+                "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
+                "sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
+                "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
+                "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
+                "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
+                "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
+                "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
+                "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
+                "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
+                "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
+                "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
+                "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
+                "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
+                "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
+                "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
+                "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==2.1.0"
+        },
+        "nordvpn-switcher": {
+            "hashes": [
+                "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
+                "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
+            ],
+            "index": "pypi",
+            "version": "==0.2.9"
+        },
         "oauthlib": {
             "hashes": [
-                "sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc",
-                "sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3"
+                "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
+                "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
             ],
-            "version": "==3.1.1"
+            "markers": "python_version >= '3.6'",
+            "version": "==3.2.0"
+        },
+        "pathlib": {
+            "hashes": [
+                "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
+            ],
+            "version": "==1.0.1"
+        },
+        "psutil": {
+            "hashes": [
+                "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
+                "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
+                "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
+                "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
+                "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
+                "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
+                "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
+                "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
+                "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
+                "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
+                "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
+                "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
+                "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
+                "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
+                "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
+                "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
+                "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
+                "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
+                "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
+                "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
+                "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
+                "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
+                "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
+                "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
+                "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
+                "sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
+                "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
+                "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
+                "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
+                "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
+                "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
+                "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==5.9.0"
+        },
+        "py-mini-racer": {
+            "hashes": [
+                "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
+                "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
+                "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
+                "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
+            ],
+            "version": "==0.6.0"
+        },
+        "pyaes": {
+            "hashes": [
+                "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
+            ],
+            "version": "==1.6.1"
         },
         "pyasn1": {
             "hashes": [
@@ -164,80 +411,128 @@
             ],
             "version": "==0.2.8"
         },
+        "pyparsing": {
+            "hashes": [
+                "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
+                "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.0.7"
+        },
         "python-dateutil": {
             "hashes": [
                 "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
                 "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
             ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.8.2"
         },
         "python-dotenv": {
             "hashes": [
-                "sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1",
-                "sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172"
+                "sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3",
+                "sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f"
             ],
             "index": "pypi",
-            "version": "==0.19.0"
+            "version": "==0.19.2"
+        },
+        "random-user-agent": {
+            "hashes": [
+                "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
+                "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
+            ],
+            "version": "==1.0.1"
         },
         "requests": {
             "hashes": [
-                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
-                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
+                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
             ],
-            "version": "==2.26.0"
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==2.27.1"
         },
         "requests-oauthlib": {
             "hashes": [
-                "sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
-                "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
-                "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
+                "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
+                "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
             ],
-            "version": "==1.3.0"
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.3.1"
+        },
+        "requests-toolbelt": {
+            "hashes": [
+                "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f",
+                "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"
+            ],
+            "version": "==0.9.1"
         },
         "rsa": {
             "hashes": [
-                "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2",
-                "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"
+                "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
+                "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
             ],
-            "version": "==4.7.2"
+            "markers": "python_version >= '3.6'",
+            "version": "==4.8"
         },
         "s3transfer": {
             "hashes": [
-                "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c",
-                "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"
+                "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
+                "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
             ],
-            "version": "==0.5.0"
+            "markers": "python_version >= '3.6'",
+            "version": "==0.5.1"
         },
         "six": {
             "hashes": [
                 "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                 "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
             ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.16.0"
         },
         "soupsieve": {
             "hashes": [
-                "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
-                "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
+                "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
+                "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"
             ],
-            "markers": "python_version >= '3.0'",
-            "version": "==2.2.1"
+            "markers": "python_version >= '3.6'",
+            "version": "==2.3.1"
+        },
+        "telethon": {
+            "hashes": [
+                "sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816",
+                "sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c"
+            ],
+            "index": "pypi",
+            "version": "==1.24.0"
+        },
+        "tiktok-downloader": {
+            "git": "https://github.com/msramalho/tiktok-downloader",
+            "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2"
         },
         "urllib3": {
             "hashes": [
-                "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
-                "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
+                "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
+                "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
             ],
-            "version": "==1.26.6"
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+            "version": "==1.26.8"
+        },
+        "werkzeug": {
+            "hashes": [
+                "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+                "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.0.3"
         },
         "youtube-dl": {
             "hashes": [
-                "sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac",
-                "sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab"
+                "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
+                "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
             ],
             "index": "pypi",
-            "version": "==2021.6.6"
+            "version": "==2021.12.17"
         }
     },
     "develop": {}
-}
+}
\ No newline at end of file
diff --git a/archivers.py b/archivers.py
new file mode 100644
index 0000000..d8a72f6
--- /dev/null
+++ b/archivers.py
@@ -0,0 +1,390 @@
+from dataclasses import dataclass
+import youtube_dl
+from bs4 import BeautifulSoup
+import requests
+import tiktok_downloader
+from loguru import logger
+import os
+import datetime
+import ffmpeg
+from botocore.errorfactory import ClientError
+import time
+import traceback
+
+# TODO There should be a better way of generating keys, that adds the following info:
+#           - name of sheet that it is being archived from
+#             (this means we might archive the same media twice on different sheets, but that's OK I think)
+#           - name of archiver/platform that the video comes from
+#       This should make it easier to maintain and clean the archive later
+
+# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
+#      cleaned up? The difficulty is that we don't know the filename until the archivers start working.
+
+
+def get_cdn_url(key):
+    return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
+        os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
+
+
+def do_s3_upload(s3_client, f, key):
+    s3_client.upload_fileobj(f, Bucket=os.getenv(
+        'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+
+
+def get_key(filename):
+    key = filename.split('/')[1]
+    if 'unknown_video' in key:
+        key = key.replace('unknown_video', 'jpg')
+    return key
+
+
+def get_thumbnails(filename, s3_client, duration=None):
+    if not os.path.exists(filename.split('.')[0]):
+        os.mkdir(filename.split('.')[0])
+
+    fps = 0.5
+    if duration is not None:
+        duration = float(duration)
+
+        if duration < 60:
+            fps = 10.0 / duration
+        elif duration < 120:
+            fps = 20.0 / duration
+        else:
+            fps = 40.0 / duration
+
+    stream = ffmpeg.input(filename)
+    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
+    stream.output(filename.split('.')[0] + '/out%d.jpg').run()
+
+    thumbnails = os.listdir(filename.split('.')[0] + '/')
+    cdn_urls = []
+
+    for fname in thumbnails:
+        if fname[-3:] == 'jpg':
+            thumbnail_filename = filename.split('.')[0] + '/' + fname
+            key = filename.split('/')[1].split('.')[0] + '/' + fname
+
+            cdn_url = get_cdn_url(key)
+
+            with open(thumbnail_filename, 'rb') as f:
+                do_s3_upload(s3_client, f, key)
+
+            cdn_urls.append(cdn_url)
+            os.remove(thumbnail_filename)
+
+    if len(cdn_urls) == 0:
+        return ('None', 'None')
+
+    key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
+
+    index_page = f'''<html><head><title>{filename}</title></head>
+        <body>'''
+
+    for t in cdn_urls:
+        index_page += f'<img src="{t}" />'
+
+    index_page += f"</body></html>"
+    index_fname = filename.split('.')[0] + '/index.html'
+
+    with open(index_fname, 'w') as f:
+        f.write(index_page)
+
+    thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
+
+    s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
+        'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+
+    thumb_index_cdn_url = get_cdn_url(thumb_index)
+
+    return (key_thumb, thumb_index_cdn_url)
+
+
+@dataclass
+class ArchiveResult:
+    status: str
+    cdn_url: str = None
+    thumbnail: str = None
+    thumbnail_index: str = None
+    duration: float = None
+    title: str = None
+    timestamp: datetime.datetime = None
+
+
+class Archiver:
+    def __init__(self, s3_client):
+        self.s3 = s3_client
+
+    def download(self, url):
+        pass
+
+
+class TelegramArchiver(Archiver):
+    def download(self, url, check_if_exists=False):
+        # detect URLs that we definitely cannot handle
+        if 'http://t.me/' not in url and 'https://t.me/' not in url:
+            return False
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
+        status = "success"
+
+        original_url = url
+
+        if url[-8:] != "?embed=1":
+            url += "?embed=1"
+
+        t = requests.get(url, headers=headers)
+        s = BeautifulSoup(t.content, 'html.parser')
+        video = s.find("video")
+
+        if video is None:
+            return False  # could not find video
+
+        video_url = video.get('src')
+        key = video_url.split('/')[-1].split('?')[0]
+        filename = 'tmp/' + key
+
+        if check_if_exists:
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                # file exists
+                cdn_url = get_cdn_url(key)
+
+                status = 'already archived'
+
+            except ClientError:
+                pass
+
+        v = requests.get(video_url, headers=headers)
+
+        with open(filename, 'wb') as f:
+            f.write(v.content)
+
+        if status != 'already archived':
+            cdn_url = get_cdn_url(key)
+
+            with open(filename, 'rb') as f:
+                do_s3_upload(self.s3, f, key)
+
+        # extract duration from HTML
+        duration = s.find_all('time')[0].contents[0]
+        if ':' in duration:
+            duration = float(duration.split(
+                ':')[0])*60 + float(duration.split(':')[1])
+        else:
+            duration = float(duration)
+
+        # process thumbnails
+        key_thumb, thumb_index = get_thumbnails(
+            filename, self.s3, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
+
+
+class YoutubeDLArchiver(Archiver):
+    def download(self, url, check_if_exists=False):
+        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
+        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
+            logger.info('Using Facebook cookie')
+            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
+
+        ydl = youtube_dl.YoutubeDL(ydl_opts)
+        cdn_url = None
+        status = 'success'
+
+        try:
+            info = ydl.extract_info(url, download=False)
+        except youtube_dl.utils.DownloadError:
+            # no video here
+            return False
+
+        if 'is_live' in info and info['is_live']:
+            logger.warning("Live streaming media, not archiving now")
+            return ArchiveResult(status="Streaming media")
+
+        if check_if_exists:
+            if 'entries' in info:
+                if len(info['entries']) > 1:
+                    logger.warning(
+                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                    return False
+
+                filename = ydl.prepare_filename(info['entries'][0])
+            else:
+                filename = ydl.prepare_filename(info)
+
+            key = get_key(filename)
+
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                # file exists
+                cdn_url = get_cdn_url(key)
+
+                status = 'already archived'
+
+            except ClientError:
+                pass
+
+        # sometimes this results in a different filename, so do this again
+        info = ydl.extract_info(url, download=True)
+
+        if 'entries' in info:
+            if len(info['entries']) > 1:
+                logger.warning(
+                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                return False
+            else:
+                info = info['entries'][0]
+
+        filename = ydl.prepare_filename(info)
+
+        if not os.path.exists(filename):
+            filename = filename.split('.')[0] + '.mkv'
+
+        if status != 'already archived':
+            key = get_key(filename)
+            cdn_url = get_cdn_url(key)
+
+            with open(filename, 'rb') as f:
+                do_s3_upload(self.s3, f, key)
+
+        # get duration
+        duration = info['duration'] if 'duration' in info else None
+
+        # get thumbnails
+        key_thumb, thumb_index = get_thumbnails(
+            filename, self.s3, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+                             title=info['title'] if 'title' in info else None,
+                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+
+
+class WaybackArchiver(Archiver):
+    def __init__(self, s3_client):
+        self.s3 = s3_client
+        self.seen_urls = {}
+
+    def download(self, url, check_if_exists=False):
+        if check_if_exists and url in self.seen_urls:
+            return self.seen_urls[url]
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
+        }
+
+        r = requests.post(
+            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        job_id = r.json()['job_id']
+
+        status_r = requests.get(
+            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+
+        retries = 0
+
+        # wait 90-120 seconds for the archive job to finish
+        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
+            time.sleep(3)
+
+            try:
+                status_r = requests.get(
+                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+            except:
+                time.sleep(1)
+
+            retries += 1
+
+        if status_r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        status_json = status_r.json()
+
+        if status_json['status'] != 'success':
+            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+
+        archive_url = 'https://web.archive.org/web/' + \
+            status_json['timestamp'] + '/' + status_json['original_url']
+
+        try:
+            r = requests.get(archive_url)
+
+            parsed = BeautifulSoup(
+                r.content, 'html.parser')
+
+            title = parsed.find_all('title')[
+                0].text
+        except:
+            title = "Could not get title"
+
+        result = ArchiveResult(
+            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        self.seen_urls[url] = result
+        return result
+
+
+class TiktokArchiver(Archiver):
+    def download(self, url, check_if_exists=False):
+        if 'tiktok.com' not in url:
+            return False
+
+        status = 'success'
+
+        try:
+            info = tiktok_downloader.info_post(url)
+            key = 'tiktok_' + str(info.id) + '.mp4'
+            filename = 'tmp/' + key
+
+            if check_if_exists:
+                try:
+                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                    # file exists
+                    cdn_url = get_cdn_url(key)
+
+                    status = 'already archived'
+
+                except ClientError:
+                    pass
+
+            if status != 'already archived':
+                media = tiktok_downloader.snaptik(url).get_media()
+                if len(media) > 0:
+                    media[0].download(filename)
+                    with open(filename, 'rb') as f:
+                        do_s3_upload(self.s3, f, key)
+
+                    cdn_url = get_cdn_url(key)
+                else:
+                    status = 'could not download media'
+
+            try:
+                key_thumb, thumb_index = get_thumbnails(
+                    filename, self.s3, duration=info.duration)
+            except:
+                key_thumb = ''
+                thumb_index = 'error creating thumbnails'
+
+            os.remove(filename)
+
+            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
+
+        except tiktok_downloader.Except.InvalidUrl:
+            status = 'Invalid URL'
+            return ArchiveResult(status=status)
+
+        except:
+            error = traceback.format_exc()
+            status = 'Other Tiktok error: ' + str(error)
+            return ArchiveResult(status=status)
diff --git a/auto_archive.py b/auto_archive.py
index f0f6862..ef4f89c 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,19 +1,15 @@
+from dataclasses import dataclass
 import gspread
-import youtube_dl
 from pathlib import Path
-import sys
 import datetime
 import boto3
 import os
 from dotenv import load_dotenv
-from botocore.errorfactory import ClientError
 import argparse
 import math
-import ffmpeg
 import threading
-import time
-from bs4 import BeautifulSoup
-import requests
+from loguru import logger
+import archivers
 
 load_dotenv()
 
@@ -46,328 +42,64 @@ def index_to_col(index):
     else:
         return alphabet[index]
 
-def get_cdn_url(key):
-    return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-                os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
 
-def do_s3_upload(s3_client, f, key):
-    s3_client.upload_fileobj(f, Bucket=os.getenv(
-                    'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
-
-
-def get_thumbnails(filename, s3_client, duration = None):
-    if not os.path.exists(filename.split('.')[0]):
-        os.mkdir(filename.split('.')[0])
-
-    fps = 0.5
-    if duration is not None:
-        duration = float(duration)
-
-        if duration < 60:
-            fps = 10.0 / duration
-        elif duration < 120:
-            fps = 20.0 / duration
-        else:
-            fps = 40.0 / duration
-
-
-    stream = ffmpeg.input(filename)
-    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-    stream.output(filename.split('.')[0] + '/out%d.jpg').run()
-
-    thumbnails = os.listdir(filename.split('.')[0] + '/')
-    cdn_urls = []
-
-    for fname in thumbnails:
-        if fname[-3:] == 'jpg':
-            thumbnail_filename = filename.split('.')[0] + '/' + fname
-            key = filename.split('/')[1].split('.')[0] + '/' + fname
-
-            cdn_url = get_cdn_url(key)
-
-            with open(thumbnail_filename, 'rb') as f:
-                do_s3_upload(s3_client, f, key)
-
-            cdn_urls.append(cdn_url)
-            os.remove(thumbnail_filename)
-
-    if len(cdn_urls) == 0:
-        return ('None', 'None')
-
-    key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
-
-    index_page = f'''<html><head><title>{filename}</title></head>
-        <body>'''
-
-    for t in cdn_urls:
-        index_page += f'<img src="{t}" />'
-
-    index_page += f"</body></html>"
-    index_fname = filename.split('.')[0] + '/index.html'
-
-    with open(index_fname, 'w') as f:
-        f.write(index_page)
-
-    thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
-
-    s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-        'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
-
-    thumb_index_cdn_url =  get_cdn_url(thumb_index)
-
-    return (key_thumb, thumb_index_cdn_url)
-
-
-def download_telegram_video(url, s3_client, check_if_exists=False):
-    status = 'success'
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
-
-    original_url = url
-
-    if url[-8:] != "?embed=1":
-        url += "?embed=1"
-
-    t = requests.get(url, headers=headers)
-    s = BeautifulSoup(t.content, 'html.parser')
-    video = s.find("video")
-
-    if video is None:
-        return ({}, 'No telegram video found')
-    else:
-        video_url = video.get('src')
-        key = video_url.split('/')[-1].split('?')[0]
-        filename = 'tmp/' + key
-
-        if check_if_exists:
-            try:
-                s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
-
-        v = requests.get(video_url, headers=headers)
-
-        with open(filename, 'wb') as f:
-            f.write(v.content)
-
-        if status != 'already archived':
-            cdn_url = get_cdn_url(key)
-
-            with open(filename, 'rb') as f:
-                do_s3_upload(s3_client, f, key)
-
-        duration = s.find_all('time')[0].contents[0]
-        if ':' in duration:
-            duration = float(duration.split(':')[0])*60 + float(duration.split(':')[1])
-        else:
-            duration = float(duration)
-
-        key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
-        os.remove(filename)
-
-        video_data = {
-            'cdn_url': cdn_url,
-            'thumbnail': key_thumb,
-            'thumbnail_index': thumb_index,
-            'duration': duration,
-            'title': original_url,
-            'timestamp': s.find_all('time')[1].get('datetime')
-        }
-
-        return (video_data, status)
-
-
-def internet_archive(url, s3_client):
-
-
-    ia_headers = {
-            "Accept": "application/json",
-            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
-        }
-
-    r = requests.post(
-        'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
-
-    if r.status_code != 200:
-        return ({}, 'Internet archive failed')
-    else:
-        job_id = r.json()['job_id']
-
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-
-        retries = 0
-
-        while status_r.json()['status'] == 'pending' and retries < 40:
-            time.sleep(5)
-
-            try:
-                status_r = requests.get(
-                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-            except:
-                time.sleep(1)
-
-            retries += 1
-
-        status_json = status_r.json()
-
-        if status_json['status'] == 'success':
-            url = 'https://web.archive.org/web/' + \
-                status_json['timestamp'] + '/' + status_json['original_url']
-
-            r = requests.get(url)
-
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
-            title = parsed.find_all('title')[
-                0].text
-
-            return ({'cdn_url': url, 'title': title}, 'Internet Archive fallback')
-        else:
-            return ({}, 'Internet Archive failed: ' + status_json['message'])
-
-def get_key(filename):
-    key = filename.split('/')[1]
-    if 'unknown_video' in key:
-        key = key.replace('unknown_video', 'jpg')
-    return key
-
-
-def download_vid(url, s3_client, check_if_exists=False):
-    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-    if (url[0:21] == 'https://facebook.com/' or url[0:25]  == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
-        print('Using cookie')
-        youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
-    ydl = youtube_dl.YoutubeDL(ydl_opts)
-    cdn_url = None
-    status = 'success'
-
-    if check_if_exists:
-        info = ydl.extract_info(url, download=False)
-
-        if 'entries' in info:
-            if len(info['entries']) > 1:
-                raise Exception(
-                    'ERROR: Cannot archive channels or pages with multiple videos')
-
-            filename = ydl.prepare_filename(info['entries'][0])
-        else:
-            filename = ydl.prepare_filename(info)
-
-        key = get_key(filename)
-
-        try:
-            s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-            # file exists
-            cdn_url = get_cdn_url(os, key)
-
-            status = 'already archived'
-
-        except ClientError:
-            pass
-
-    # sometimes this results in a different filename, so do this again
-    info = ydl.extract_info(url, download=True)
-
-    if 'entries' in info:
-        if len(info['entries']) > 1:
-            raise Exception(
-                'ERROR: Cannot archive channels or pages with multiple videos')
-        else:
-            info = info['entries'][0]
-
-    filename = ydl.prepare_filename(info)
-
-    if not os.path.exists(filename):
-        filename = filename.split('.')[0] + '.mkv'
-
-    if status != 'already archived':
-        key = get_key(filename)
-        cdn_url = get_cdn_url(os, key)
-
-        with open(filename, 'rb') as f:
-            do_s3_upload(s3_client, f, key)
-
-    duration = info['duration'] if 'duration' in info else None
-    key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
-    os.remove(filename)
-
-    video_data = {
-        'cdn_url': cdn_url,
-        'thumbnail': key_thumb,
-        'thumbnail_index': thumb_index,
-        'duration': duration,
-        'title': info['title'] if 'title' in info else None,
-        'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None,
-    }
-
-    return (video_data, status)
-
-
-def update_sheet(wks, row, status, video_data, columns, v):
+def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
         update += [{
             'range': columns['status'] + str(row),
-            'values': [[status]]
+            'values': [[result.status]]
         }]
 
-    if 'cdn_url' in video_data and video_data['cdn_url'] is not None and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '':
+    if result.cdn_url and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '':
         update += [{
             'range': columns['archive'] + str(row),
-            'values': [[video_data['cdn_url']]]
+            'values': [[result.cdn_url]]
         }]
 
-    if 'date' in video_data and columns['date'] is not None and v[col_to_index(columns['date'])] == '':
+    if columns['date'] is not None and v[col_to_index(columns['date'])] == '':
         update += [{
             'range': columns['date'] + str(row),
             'values': [[datetime.datetime.now().isoformat()]]
         }]
 
-    if 'thumbnail' in video_data and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '':
+    if result.thumbnail and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '':
         update += [{
             'range': columns['thumbnail'] + str(row),
-            'values': [['=IMAGE("' + video_data['thumbnail'] + '")']]
+            'values': [['=IMAGE("' + result.thumbnail + '")']]
         }]
 
-    if 'thumbnail_index' in video_data and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '':
+    if result.thumbnail_index and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '':
         update += [{
             'range': columns['thumbnail_index'] + str(row),
-            'values': [[video_data['thumbnail_index']]]
+            'values': [[result.thumbnail_index]]
         }]
 
-    if 'timestamp' in video_data and columns['timestamp'] is not None and video_data['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '':
+    if result.timestamp and columns['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '':
         update += [{
             'range': columns['timestamp'] + str(row),
-            'values': [[video_data['timestamp']]] if type(video_data['timestamp']) == str else [[datetime.datetime.fromtimestamp(video_data['timestamp']).isoformat()]]
+            'values': [[result.timestamp]] if type(result.timestamp) == str else [[datetime.datetime.fromtimestamp(result.timestamp).isoformat()]]
         }]
 
-    if 'title' in video_data and columns['title'] is not None and video_data['title'] is not None and v[col_to_index(columns['title'])] == '':
+    if result.title and columns['title'] is not None and v[col_to_index(columns['title'])] == '':
         update += [{
             'range': columns['title'] + str(row),
-            'values': [[video_data['title']]]
+            'values': [[result.title]]
         }]
 
-    if 'duration' in video_data and columns['duration'] is not None and video_data['duration'] is not None and v[col_to_index(columns['duration'])] == '':
+    if result.duration and columns['duration'] is not None and v[col_to_index(columns['duration'])] == '':
         update += [{
             'range': columns['duration'] + str(row),
-            'values': [[str(video_data['duration'])]]
+            'values': [[str(result.duration)]]
         }]
 
     wks.batch_update(update, value_input_option='USER_ENTERED')
 
 
-def record_stream(url, s3_client, wks, i, columns, v):
-    video_data, status = download_vid(url, s3_client)
-    update_sheet(wks, i, status, video_data, columns, v)
+# def record_stream(url, s3_client, wks, i, columns, v):
+#     video_data, status = download_vid(url, s3_client)
+#     update_sheet(wks, i, status, video_data, columns, v)
 
 
 def process_sheet(sheet):
@@ -384,7 +116,7 @@ def process_sheet(sheet):
 
     # loop through worksheets to check
     for ii in range(n_worksheets):
-        print("Opening worksheet " + str(ii))
+        logger.info("Opening worksheet " + str(ii))
         wks = sh.get_worksheet(ii)
         values = wks.get_all_values()
 
@@ -396,7 +128,7 @@ def process_sheet(sheet):
                 'source url')) if 'source url' in headers else None
 
         if columns['url'] is None:
-            print("No 'Media URL' column found, skipping")
+            logger.warning("No 'Media URL' column found, skipping")
             continue
 
         url_index = col_to_index(columns['url'])
@@ -409,7 +141,7 @@ def process_sheet(sheet):
             'archive status')) if 'archive status' in headers else None
 
         if columns['status'] is None:
-            print("No 'Archive status' column found, skipping")
+            logger.warning("No 'Archive status' column found, skipping")
             continue
 
         columns['thumbnail'] = index_to_col(headers.index(
@@ -423,6 +155,15 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
+
+        active_archivers = [
+            archivers.TelegramArchiver(s3_client),
+            archivers.TiktokArchiver(s3_client),
+            archivers.YoutubeDLArchiver(s3_client),
+            archivers.WaybackArchiver(s3_client)
+        ]
+
+
         # loop through rows in worksheet
         for i in range(2, len(values)+1):
             v = values[i-1]
@@ -434,61 +175,25 @@ def process_sheet(sheet):
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
                     wks.update(
-                            columns['status'] + str(i), 'Archive in progress')
+                        columns['status'] + str(i), 'Archive in progress')
 
-                    if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]:
-                        video_data, status = download_telegram_video(
-                            v[url_index], s3_client, check_if_exists=True)
-                        
-                        if status == 'No telegram video found':
-                            print("Trying Internet Archive fallback")
+                    for archiver in active_archivers:
+                        logger.debug(f"Trying {archiver} on row {i}")
+                        result = archiver.download(v[url_index], check_if_exists=True)
+                        if result:
+                            logger.info(f"{archiver} succeeded on row {i}")
+                            break
 
-                            video_data, status = internet_archive(
-                                v[url_index], s3_client)
-                        
-                        update_sheet(wks, i, status, video_data, columns, v)
+                    if result:
+                        update_sheet(wks, i, result, columns, v)
 
-                    else:
-                        try:
-                            ydl_opts = {
-                                'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-                            if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
-                                print('Using cookie')
-                                youtube_dl.utils.std_headers['cookie'] = os.getenv(
-                                    'FB_COOKIE')
-                            ydl = youtube_dl.YoutubeDL(ydl_opts)
-                            info = ydl.extract_info(
-                                v[url_index], download=False)
 
-                            if 'is_live' in info and info['is_live']:
-                                wks.update(columns['status'] +
-                                            str(i), 'Recording stream')
-                                t = threading.Thread(target=record_stream, args=(
-                                    v[url_index], s3_client, wks, i, columns, v))
-                                t.start()
-                                continue
-                            elif 'is_live' not in info or not info['is_live']:
-                                video_data, status = download_vid(
-                                    v[url_index], s3_client, check_if_exists=True)
-                                update_sheet(wks, i, status,
-                                                video_data, columns, v)
-                        
-                        except:
-                            # i'm sure there's a better way to handle this than nested try/catch blocks
-                            try:
-                                print("Trying Internet Archive fallback")
+                        # except:
+                            # if any unexpected errors occurred, log these into the Google Sheet
+                            # t, value, traceback = sys.exc_info()
 
-                                video_data, status = internet_archive(
-                                    v[url_index], s3_client)
-                                update_sheet(wks, i, status,
-                                             video_data, columns, v)
-
-                            except:
-                                # if any unexpected errors occured, log these into the Google Sheet
-                                t, value, traceback = sys.exc_info()
-
-                                update_sheet(wks, i, str(
-                                    value), {}, columns, v)
+                            # update_sheet(wks, i, str(
+                            #     value), {}, columns, v)
 
 
 def main():
@@ -497,7 +202,7 @@ def main():
     parser.add_argument("--sheet", action="store", dest="sheet")
     args = parser.parse_args()
 
-    print("Opening document " + args.sheet)
+    logger.info("Opening document " + args.sheet)
 
     process_sheet(args.sheet)
 
diff --git a/auto_auto_archive.py b/auto_auto_archive.py
index f725d10..a518204 100644
--- a/auto_auto_archive.py
+++ b/auto_auto_archive.py
@@ -1,8 +1,7 @@
 import gspread
-import subprocess
 import argparse
 import auto_archive
-import datetime
+from loguru import logger
 
 def main():
     parser = argparse.ArgumentParser(
@@ -11,8 +10,7 @@ def main():
 
     args = parser.parse_args()
 
-    print(datetime.datetime.now())
-    print("Opening document " + args.sheet)
+    logger.info("Opening document " + args.sheet)
 
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(args.sheet)
@@ -23,7 +21,7 @@ def main():
     for i in range(11, len(values)):
         sheet_name = values[i][0]
 
-        print("Processing " + sheet_name)
+        logger.info("Processing " + sheet_name)
 
         auto_archive.process_sheet(sheet_name)
 
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 53073dc..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-gspread
-youtube_dl
-boto3
-python-dotenv
-

From 009c0dd8cadb3420fc99acc1e6b1243c9f0f4ddb Mon Sep 17 00:00:00 2001
From: Logan Williams <logan.williams@alum.mit.edu>
Date: Sun, 20 Feb 2022 11:06:47 +0100
Subject: [PATCH 02/16] Clean up dependencies

---
 Pipfile      |  4 +---
 Pipfile.lock | 25 ++-----------------------
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/Pipfile b/Pipfile
index 88dbebf..0d954c9 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,14 +9,12 @@ boto3 = "*"
 python-dotenv = "*"
 youtube_dl = "*"
 argparse = "*"
-ffmpeg-python = "*"
 beautifulsoup4 = "*"
 nordvpn-switcher = "*"
 tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
-telethon = "*"
-ffmpeg = "*"
 bs4 = "*"
 loguru = "*"
+ffmpeg-python = "*"
 
 [dev-packages]
 
diff --git a/Pipfile.lock b/Pipfile.lock
index 8a5f227..b354d59 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "420a5d5c155830dac792fe2f037bebce97c30f4271301bb5950a288254798660"
+            "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -93,13 +93,6 @@
             ],
             "version": "==1.2.58"
         },
-        "ffmpeg": {
-            "hashes": [
-                "sha256:6931692c890ff21d39938433c2189747815dca0c60ddc7f9bb97f199dba0b5b9"
-            ],
-            "index": "pypi",
-            "version": "==1.4"
-        },
         "ffmpeg-python": {
             "hashes": [
                 "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -369,12 +362,6 @@
             ],
             "version": "==0.6.0"
         },
-        "pyaes": {
-            "hashes": [
-                "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
-            ],
-            "version": "==1.6.1"
-        },
         "pyasn1": {
             "hashes": [
                 "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@@ -497,14 +484,6 @@
             "markers": "python_version >= '3.6'",
             "version": "==2.3.1"
         },
-        "telethon": {
-            "hashes": [
-                "sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816",
-                "sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c"
-            ],
-            "index": "pypi",
-            "version": "==1.24.0"
-        },
         "tiktok-downloader": {
             "git": "https://github.com/msramalho/tiktok-downloader",
             "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2"
@@ -535,4 +514,4 @@
         }
     },
     "develop": {}
-}
\ No newline at end of file
+}

From f3ce22666562bed2780181dbef95b8dee5a5e69e Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 21 Feb 2022 14:19:09 +0100
Subject: [PATCH 03/16] split into multiple files MVP

---
 .gitignore                      |   3 +-
 Pipfile                         |   1 -
 Pipfile.lock                    | 145 +-----------
 README.md                       |   2 +
 archivers.py                    | 390 --------------------------------
 archivers/__init__.py           |   6 +
 archivers/base_archiver.py      | 115 ++++++++++
 archivers/telegram_archiver.py  |  76 +++++++
 archivers/tiktok_archiver.py    |  68 ++++++
 archivers/wayback_archiver.py   |  73 ++++++
 archivers/youtubedl_archiver.py |  88 +++++++
 auto_archive.py                 |  15 +-
 12 files changed, 446 insertions(+), 536 deletions(-)
 delete mode 100644 archivers.py
 create mode 100644 archivers/__init__.py
 create mode 100644 archivers/base_archiver.py
 create mode 100644 archivers/telegram_archiver.py
 create mode 100644 archivers/tiktok_archiver.py
 create mode 100644 archivers/wayback_archiver.py
 create mode 100644 archivers/youtubedl_archiver.py

diff --git a/.gitignore b/.gitignore
index b6a6b68..5d7eec9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
 tmp/
-.env
+.env*
 .DS_Store
 expmt/
 service_account.json
 __pycache__/
 ._*
+anu.html
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index 0d954c9..27071fa 100644
--- a/Pipfile
+++ b/Pipfile
@@ -10,7 +10,6 @@ python-dotenv = "*"
 youtube_dl = "*"
 argparse = "*"
 beautifulsoup4 = "*"
-nordvpn-switcher = "*"
 tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
 bs4 = "*"
 loguru = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index b354d59..9879884 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
+            "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -93,6 +93,14 @@
             ],
             "version": "==1.2.58"
         },
+        "faker": {
+            "hashes": [
+                "sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
+                "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==13.0.0"
+        },
         "ffmpeg-python": {
             "hashes": [
                 "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -180,73 +188,6 @@
             "index": "pypi",
             "version": "==0.6.0"
         },
-        "lxml": {
-            "hashes": [
-                "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
-                "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
-                "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
-                "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
-                "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
-                "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
-                "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
-                "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
-                "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
-                "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
-                "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
-                "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
-                "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
-                "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
-                "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
-                "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
-                "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
-                "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
-                "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
-                "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
-                "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
-                "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
-                "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
-                "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
-                "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
-                "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
-                "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
-                "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
-                "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
-                "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
-                "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
-                "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
-                "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
-                "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
-                "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
-                "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
-                "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
-                "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
-                "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
-                "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
-                "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
-                "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
-                "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
-                "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
-                "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
-                "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
-                "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
-                "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
-                "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
-                "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
-                "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
-                "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
-                "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
-                "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
-                "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
-                "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
-                "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
-                "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
-                "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
-                "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
-                "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==4.8.0"
-        },
         "markupsafe": {
             "hashes": [
                 "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
@@ -293,14 +234,6 @@
             "markers": "python_version >= '3.7'",
             "version": "==2.1.0"
         },
-        "nordvpn-switcher": {
-            "hashes": [
-                "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
-                "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
-            ],
-            "index": "pypi",
-            "version": "==0.2.9"
-        },
         "oauthlib": {
             "hashes": [
                 "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@@ -309,59 +242,6 @@
             "markers": "python_version >= '3.6'",
             "version": "==3.2.0"
         },
-        "pathlib": {
-            "hashes": [
-                "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
-            ],
-            "version": "==1.0.1"
-        },
-        "psutil": {
-            "hashes": [
-                "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
-                "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
-                "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
-                "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
-                "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
-                "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
-                "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
-                "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
-                "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
-                "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
-                "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
-                "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
-                "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
-                "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
-                "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
-                "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
-                "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
-                "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
-                "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
-                "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
-                "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
-                "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
-                "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
-                "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
-                "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
-                "sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
-                "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
-                "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
-                "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
-                "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
-                "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
-                "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==5.9.0"
-        },
-        "py-mini-racer": {
-            "hashes": [
-                "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
-                "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
-                "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
-                "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
-            ],
-            "version": "==0.6.0"
-        },
         "pyasn1": {
             "hashes": [
                 "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@@ -422,13 +302,6 @@
             "index": "pypi",
             "version": "==0.19.2"
         },
-        "random-user-agent": {
-            "hashes": [
-                "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
-                "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
-            ],
-            "version": "==1.0.1"
-        },
         "requests": {
             "hashes": [
                 "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
diff --git a/README.md b/README.md
index 2e40bcc..cec6e9a 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
 
 [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
 
+[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. 
+
 A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
 
 ```
diff --git a/archivers.py b/archivers.py
deleted file mode 100644
index d8a72f6..0000000
--- a/archivers.py
+++ /dev/null
@@ -1,390 +0,0 @@
-from dataclasses import dataclass
-import youtube_dl
-from bs4 import BeautifulSoup
-import requests
-import tiktok_downloader
-from loguru import logger
-import os
-import datetime
-import ffmpeg
-from botocore.errorfactory import ClientError
-import time
-import traceback
-
-# TODO There should be a better way of generating keys, that adds the following info:
-#           - name of sheet that it is being archived from
-#             (this means we might archive the same media twice on different sheets, but that's OK I think)
-#           - name of archiver/platform that the video comes from
-#       This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-#      cleaned up? Difficult is we don't know the filename until the archivers start working.
-
-
-def get_cdn_url(key):
-    return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-        os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-
-def do_s3_upload(s3_client, f, key):
-    s3_client.upload_fileobj(f, Bucket=os.getenv(
-        'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
-
-
-def get_key(filename):
-    key = filename.split('/')[1]
-    if 'unknown_video' in key:
-        key = key.replace('unknown_video', 'jpg')
-    return key
-
-
-def get_thumbnails(filename, s3_client, duration=None):
-    if not os.path.exists(filename.split('.')[0]):
-        os.mkdir(filename.split('.')[0])
-
-    fps = 0.5
-    if duration is not None:
-        duration = float(duration)
-
-        if duration < 60:
-            fps = 10.0 / duration
-        elif duration < 120:
-            fps = 20.0 / duration
-        else:
-            fps = 40.0 / duration
-
-    stream = ffmpeg.input(filename)
-    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-    stream.output(filename.split('.')[0] + '/out%d.jpg').run()
-
-    thumbnails = os.listdir(filename.split('.')[0] + '/')
-    cdn_urls = []
-
-    for fname in thumbnails:
-        if fname[-3:] == 'jpg':
-            thumbnail_filename = filename.split('.')[0] + '/' + fname
-            key = filename.split('/')[1].split('.')[0] + '/' + fname
-
-            cdn_url = get_cdn_url(key)
-
-            with open(thumbnail_filename, 'rb') as f:
-                do_s3_upload(s3_client, f, key)
-
-            cdn_urls.append(cdn_url)
-            os.remove(thumbnail_filename)
-
-    if len(cdn_urls) == 0:
-        return ('None', 'None')
-
-    key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
-
-    index_page = f'''<html><head><title>{filename}</title></head>
-        <body>'''
-
-    for t in cdn_urls:
-        index_page += f'<img src="{t}" />'
-
-    index_page += f"</body></html>"
-    index_fname = filename.split('.')[0] + '/index.html'
-
-    with open(index_fname, 'w') as f:
-        f.write(index_page)
-
-    thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
-
-    s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-        'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
-
-    thumb_index_cdn_url = get_cdn_url(thumb_index)
-
-    return (key_thumb, thumb_index_cdn_url)
-
-
-@dataclass
-class ArchiveResult:
-    status: str
-    cdn_url: str = None
-    thumbnail: str = None
-    thumbnail_index: str = None
-    duration: float = None
-    title: str = None
-    timestamp: datetime.datetime = None
-
-
-class Archiver:
-    def __init__(self, s3_client):
-        self.s3 = s3_client
-
-    def download(self, url):
-        pass
-
-
-class TelegramArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        # detect URLs that we definitely cannot handle
-        if 'http://t.me/' not in url and 'https://t.me/' not in url:
-            return False
-
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
-        status = "success"
-
-        original_url = url
-
-        if url[-8:] != "?embed=1":
-            url += "?embed=1"
-
-        t = requests.get(url, headers=headers)
-        s = BeautifulSoup(t.content, 'html.parser')
-        video = s.find("video")
-
-        if video is None:
-            return False  # could not find video
-
-        video_url = video.get('src')
-        key = video_url.split('/')[-1].split('?')[0]
-        filename = 'tmp/' + key
-
-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
-
-        v = requests.get(video_url, headers=headers)
-
-        with open(filename, 'wb') as f:
-            f.write(v.content)
-
-        if status != 'already archived':
-            cdn_url = get_cdn_url(key)
-
-            with open(filename, 'rb') as f:
-                do_s3_upload(self.s3, f, key)
-
-        # extract duration from HTML
-        duration = s.find_all('time')[0].contents[0]
-        if ':' in duration:
-            duration = float(duration.split(
-                ':')[0])*60 + float(duration.split(':')[1])
-        else:
-            duration = float(duration)
-
-        # process thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
-        os.remove(filename)
-
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
-                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
-
-
-class YoutubeDLArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
-            logger.info('Using Facebook cookie')
-            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
-
-        ydl = youtube_dl.YoutubeDL(ydl_opts)
-        cdn_url = None
-        status = 'success'
-
-        try:
-            info = ydl.extract_info(url, download=False)
-        except youtube_dl.utils.DownloadError:
-            # no video here
-            return False
-
-        if 'is_live' in info and info['is_live']:
-            logger.warning("Live streaming media, not archiving now")
-            return ArchiveResult(status="Streaming media")
-
-        if check_if_exists:
-            if 'entries' in info:
-                if len(info['entries']) > 1:
-                    logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
-                    return False
-
-                filename = ydl.prepare_filename(info['entries'][0])
-            else:
-                filename = ydl.prepare_filename(info)
-
-            key = get_key(filename)
-
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
-
-        # sometimes this results in a different filename, so do this again
-        info = ydl.extract_info(url, download=True)
-
-        if 'entries' in info:
-            if len(info['entries']) > 1:
-                logger.warning(
-                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
-                return False
-            else:
-                info = info['entries'][0]
-
-        filename = ydl.prepare_filename(info)
-
-        if not os.path.exists(filename):
-            filename = filename.split('.')[0] + '.mkv'
-
-        if status != 'already archived':
-            key = get_key(filename)
-            cdn_url = get_cdn_url(key)
-
-            with open(filename, 'rb') as f:
-                do_s3_upload(self.s3, f, key)
-
-        # get duration
-        duration = info['duration'] if 'duration' in info else None
-
-        # get thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
-        os.remove(filename)
-
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
-
-
-class WaybackArchiver(Archiver):
-    def __init__(self, s3_client):
-        self.s3 = s3_client
-        self.seen_urls = {}
-
-    def download(self, url, check_if_exists=False):
-        if check_if_exists and url in self.seen_urls:
-            return self.seen_urls[url]
-
-        ia_headers = {
-            "Accept": "application/json",
-            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
-        }
-
-        r = requests.post(
-            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
-
-        if r.status_code != 200:
-            return ArchiveResult(status="Internet archive failed")
-
-        job_id = r.json()['job_id']
-
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-
-        retries = 0
-
-        # wait 90-120 seconds for the archive job to finish
-        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
-            time.sleep(3)
-
-            try:
-                status_r = requests.get(
-                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-            except:
-                time.sleep(1)
-
-            retries += 1
-
-        if status_r.status_code != 200:
-            return ArchiveResult(status="Internet archive failed")
-
-        status_json = status_r.json()
-
-        if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
-
-        archive_url = 'https://web.archive.org/web/' + \
-            status_json['timestamp'] + '/' + status_json['original_url']
-
-        try:
-            r = requests.get(archive_url)
-
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
-
-            title = parsed.find_all('title')[
-                0].text
-        except:
-            title = "Could not get title"
-
-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
-        self.seen_urls[url] = result
-        return result
-
-
-class TiktokArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        if 'tiktok.com' not in url:
-            return False
-
-        status = 'success'
-
-        try:
-            info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
-            filename = 'tmp/' + key
-
-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                    # file exists
-                    cdn_url = get_cdn_url(key)
-
-                    status = 'already archived'
-
-                except ClientError:
-                    pass
-
-            if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        do_s3_upload(self.s3, f, key)
-
-                    cdn_url = get_cdn_url(key)
-                else:
-                    status = 'could not download media'
-
-            try:
-                key_thumb, thumb_index = get_thumbnails(
-                    filename, self.s3, duration=info.duration)
-            except:
-                key_thumb = ''
-                thumb_index = 'error creating thumbnails'
-
-            os.remove(filename)
-
-            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
-                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
-
-        except tiktok_downloader.Except.InvalidUrl:
-            status = 'Invalid URL'
-            return ArchiveResult(status=status)
-
-        except:
-            error = traceback.format_exc()
-            status = 'Other Tiktok error: ' + str(error)
-            return ArchiveResult(status=status)
diff --git a/archivers/__init__.py b/archivers/__init__.py
new file mode 100644
index 0000000..e6c4ba6
--- /dev/null
+++ b/archivers/__init__.py
@@ -0,0 +1,6 @@
+# we need to explicitly expose the available imports here
+from .base_archiver import *
+from .telegram_archiver import *
+from .tiktok_archiver import *
+from .wayback_archiver import *
+from .youtubedl_archiver import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
new file mode 100644
index 0000000..3f9f4ac
--- /dev/null
+++ b/archivers/base_archiver.py
@@ -0,0 +1,115 @@
+import os
+import ffmpeg
+from dataclasses import dataclass
+import datetime
+from loguru import logger
+
+# TODO There should be a better way of generating keys, that adds the following info:
+#           - name of sheet that it is being archived from
+#             (this means we might archive the same media twice on different sheets, but that's OK I think)
+#           - name of archiver/platform that the video comes from
+#       This should make it easier to maintain and clean the archive later
+
+# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
+#      cleaned up? Difficult is we don't know the filename until the archivers start working.
+
+
+@dataclass
+class ArchiveResult:
+    status: str
+    cdn_url: str = None
+    thumbnail: str = None
+    thumbnail_index: str = None
+    duration: float = None
+    title: str = None
+    timestamp: datetime.datetime = None
+
+
+class Archiver:
+    name = "default"
+
+    def __init__(self, s3_client):
+        self.s3 = s3_client
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    def download(self, url, check_if_exists=False):
+        logger.error("method 'download' not implemented")
+
+    def get_cdn_url(self, key):
+        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
+            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
+
+    def do_s3_upload(self, f, key):
+        self.s3.upload_fileobj(f, Bucket=os.getenv(
+            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+
+    def get_key(self, filename):
+        print(f"key base implementation: {self.name}")
+        # TODO: refactor to be more manageable
+        key = filename.split('/')[1]
+        if 'unknown_video' in key:
+            key = key.replace('unknown_video', 'jpg')
+        return key
+
+    def get_thumbnails(self, filename, duration=None):
+        if not os.path.exists(filename.split('.')[0]):
+            os.mkdir(filename.split('.')[0])
+
+        fps = 0.5
+        if duration is not None:
+            duration = float(duration)
+
+            if duration < 60:
+                fps = 10.0 / duration
+            elif duration < 120:
+                fps = 20.0 / duration
+            else:
+                fps = 40.0 / duration
+
+        stream = ffmpeg.input(filename)
+        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
+        stream.output(filename.split('.')[0] + '/out%d.jpg').run()
+
+        thumbnails = os.listdir(filename.split('.')[0] + '/')
+        cdn_urls = []
+
+        for fname in thumbnails:
+            if fname[-3:] == 'jpg':
+                thumbnail_filename = filename.split('.')[0] + '/' + fname
+                key = filename.split('/')[1].split('.')[0] + '/' + fname
+
+                cdn_url = self.get_cdn_url(key)
+
+                with open(thumbnail_filename, 'rb') as f:
+                    self.do_s3_upload(f, key)
+
+                cdn_urls.append(cdn_url)
+                os.remove(thumbnail_filename)
+
+        if len(cdn_urls) == 0:
+            return ('None', 'None')
+
+        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
+
+        index_page = f'''<html><head><title>{filename}</title></head>
+            <body>'''
+
+        for t in cdn_urls:
+            index_page += f'<img src="{t}" />'
+
+        index_page += f"</body></html>"
+        index_fname = filename.split('.')[0] + '/index.html'
+
+        with open(index_fname, 'w') as f:
+            f.write(index_page)
+
+        thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
+
+        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
+            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+
+        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+
+        return (key_thumb, thumb_index_cdn_url)
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
new file mode 100644
index 0000000..d9168e4
--- /dev/null
+++ b/archivers/telegram_archiver.py
@@ -0,0 +1,76 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from botocore.errorfactory import ClientError
+from .base_archiver import Archiver, ArchiveResult
+
+# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+
+
+class TelegramArchiver(Archiver):
+    name = "telegram"
+    
+    def download(self, url, check_if_exists=False):
+        # detect URLs that we definitely cannot handle
+        if 'http://t.me/' not in url and 'https://t.me/' not in url:
+            return False
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+        }
+        status = "success"
+
+        original_url = url
+
+        # TODO: check if we can do this more resilient to user-input
+        if url[-8:] != "?embed=1":
+            url += "?embed=1"
+
+        t = requests.get(url, headers=headers)
+        s = BeautifulSoup(t.content, 'html.parser')
+        video = s.find("video")
+
+        if video is None:
+            return False  # could not find video
+
+        video_url = video.get('src')
+        key = video_url.split('/')[-1].split('?')[0]
+        filename = 'tmp/' + key
+
+        if check_if_exists:
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                # file exists
+                cdn_url = self.get_cdn_url(key)
+
+                status = 'already archived'
+
+            except ClientError:
+                pass
+
+        v = requests.get(video_url, headers=headers)
+
+        with open(filename, 'wb') as f:
+            f.write(v.content)
+
+        if status != 'already archived':
+            cdn_url = self.get_cdn_url(key)
+
+            with open(filename, 'rb') as f:
+                self.do_s3_upload(f, key)
+
+        # extract duration from HTML
+        duration = s.find_all('time')[0].contents[0]
+        if ':' in duration:
+            duration = float(duration.split(
+                ':')[0]) * 60 + float(duration.split(':')[1])
+        else:
+            duration = float(duration)
+
+        # process thumbnails
+        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
new file mode 100644
index 0000000..1e3bcaf
--- /dev/null
+++ b/archivers/tiktok_archiver.py
@@ -0,0 +1,68 @@
+import os, traceback
+from botocore.errorfactory import ClientError
+import tiktok_downloader
+from loguru import logger
+from .base_archiver import Archiver, ArchiveResult
+
+# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+
+
+class TiktokArchiver(Archiver):
+    name = "tiktok"
+    
+    def download(self, url, check_if_exists=False):
+        if 'tiktok.com' not in url:
+            return False
+
+        status = 'success'
+
+        try:
+            info = tiktok_downloader.info_post(url)
+            key = 'tiktok_' + str(info.id) + '.mp4'
+            filename = 'tmp/' + key
+
+            if check_if_exists:
+                try:
+                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                    # file exists
+                    cdn_url = self.get_cdn_url(key)
+
+                    status = 'already archived'
+
+                except ClientError:
+                    pass
+
+            if status != 'already archived':
+                media = tiktok_downloader.snaptik(url).get_media()
+                if len(media) > 0:
+                    media[0].download(filename)
+                    with open(filename, 'rb') as f:
+                        self.do_s3_upload(f, key)
+
+                    cdn_url = self.get_cdn_url(key)
+                else:
+                    status = 'could not download media'
+
+            try:
+                key_thumb, thumb_index = self.get_thumbnails(
+                    filename, duration=info.duration)
+            except:
+                key_thumb = ''
+                thumb_index = 'error creating thumbnails'
+
+            try: os.remove(filename)
+            except FileNotFoundError:
+                logger.info(f'tmp file not found thus not deleted {filename}')
+
+            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
+
+        except tiktok_downloader.Except.InvalidUrl:
+            status = 'Invalid URL'
+            return ArchiveResult(status=status)
+
+        except:
+            error = traceback.format_exc()
+            status = 'Other Tiktok error: ' + str(error)
+            return ArchiveResult(status=status)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
new file mode 100644
index 0000000..a021324
--- /dev/null
+++ b/archivers/wayback_archiver.py
@@ -0,0 +1,73 @@
+import time, requests, os
+from bs4 import BeautifulSoup
+
+from .base_archiver import Archiver, ArchiveResult
+
+
+class WaybackArchiver(Archiver):
+    name = "wayback"
+    
+    def __init__(self, s3_client):
+        self.s3 = s3_client
+        self.seen_urls = {}
+
+    def download(self, url, check_if_exists=False):
+        if check_if_exists and url in self.seen_urls:
+            return self.seen_urls[url]
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
+        }
+
+        r = requests.post(
+            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        job_id = r.json()['job_id']
+
+        status_r = requests.get(
+            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+
+        retries = 0
+
+        # wait 90-120 seconds for the archive job to finish
+        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
+            time.sleep(3)
+
+            try:
+                status_r = requests.get(
+                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+            except:
+                time.sleep(1)
+
+            retries += 1
+
+        if status_r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        status_json = status_r.json()
+
+        if status_json['status'] != 'success':
+            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+
+        archive_url = 'https://web.archive.org/web/' + \
+            status_json['timestamp'] + '/' + status_json['original_url']
+
+        try:
+            r = requests.get(archive_url)
+
+            parsed = BeautifulSoup(
+                r.content, 'html.parser')
+
+            title = parsed.find_all('title')[
+                0].text
+        except:
+            title = "Could not get title"
+
+        result = ArchiveResult(
+            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        self.seen_urls[url] = result
+        return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
new file mode 100644
index 0000000..8249cfa
--- /dev/null
+++ b/archivers/youtubedl_archiver.py
@@ -0,0 +1,88 @@
+
+import os
+import datetime
+import youtube_dl
+from loguru import logger
+from botocore.errorfactory import ClientError
+from .base_archiver import Archiver, ArchiveResult
+
+class YoutubeDLArchiver(Archiver):
+    name = "yotube_dl"
+    
+    def download(self, url, check_if_exists=False):
+        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
+        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
+            logger.info('Using Facebook cookie')
+            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
+
+        ydl = youtube_dl.YoutubeDL(ydl_opts)
+        cdn_url = None
+        status = 'success'
+
+        try:
+            info = ydl.extract_info(url, download=False)
+        except youtube_dl.utils.DownloadError:
+            # no video here
+            return False
+
+        if 'is_live' in info and info['is_live']:
+            logger.warning("Live streaming media, not archiving now")
+            return ArchiveResult(status="Streaming media")
+
+        if check_if_exists:
+            if 'entries' in info:
+                if len(info['entries']) > 1:
+                    logger.warning(
+                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                    return False
+
+                filename = ydl.prepare_filename(info['entries'][0])
+            else:
+                filename = ydl.prepare_filename(info)
+
+            key = self.get_key(filename)
+
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                # file exists
+                cdn_url = self.get_cdn_url(key)
+
+                status = 'already archived'
+
+            except ClientError:
+                pass
+
+        # sometimes this results in a different filename, so do this again
+        info = ydl.extract_info(url, download=True)
+
+        if 'entries' in info:
+            if len(info['entries']) > 1:
+                logger.warning(
+                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                return False
+            else:
+                info = info['entries'][0]
+
+        filename = ydl.prepare_filename(info)
+
+        if not os.path.exists(filename):
+            filename = filename.split('.')[0] + '.mkv'
+
+        if status != 'already archived':
+            key = self. get_key(filename)
+            cdn_url = self.get_cdn_url(key)
+
+            with open(filename, 'rb') as f:
+                self.do_s3_upload(f, key)
+
+        # get duration
+        duration = info['duration'] if 'duration' in info else None
+
+        # get thumbnails
+        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+                             title=info['title'] if 'title' in info else None,
+                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
diff --git a/auto_archive.py b/auto_archive.py
index ef4f89c..c478463 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,14 +1,12 @@
-from dataclasses import dataclass
-import gspread
-from pathlib import Path
-import datetime
-import boto3
 import os
-from dotenv import load_dotenv
+import datetime
 import argparse
 import math
-import threading
+import gspread
+import boto3
 from loguru import logger
+from dotenv import load_dotenv
+
 import archivers
 
 load_dotenv()
@@ -156,6 +154,7 @@ def process_sheet(sheet):
             'duration')) if 'duration' in headers else None
 
 
+        # order matters, first to succeed excludes remaining
         active_archivers = [
             archivers.TelegramArchiver(s3_client),
             archivers.TiktokArchiver(s3_client),
@@ -198,7 +197,7 @@ def process_sheet(sheet):
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Automatically use youtube-dl to download media from a Google Sheet")
+        description="Automatically archive social media videos from a Google Sheet")
     parser.add_argument("--sheet", action="store", dest="sheet")
     args = parser.parse_args()
 

From 07b5d357b478b892313f8813e4fb77764fd811c9 Mon Sep 17 00:00:00 2001
From: Logan Williams <logan.williams@alum.mit.edu>
Date: Tue, 22 Feb 2022 08:20:45 +0100
Subject: [PATCH 04/16] Fix bugs in WaybackArchiver, follow redirects sometimes

---
 archivers.py    | 48 +++++++++++++++++++++++++++++++++++-------------
 auto_archive.py | 30 +++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/archivers.py b/archivers.py
index d8a72f6..7c8df8c 100644
--- a/archivers.py
+++ b/archivers.py
@@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
             if 'entries' in info:
                 if len(info['entries']) > 1:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
+                    logger.warning(
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False
 
                 filename = ydl.prepare_filename(info['entries'][0])
@@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
+        try:
+            key_thumb, thumb_index = get_thumbnails(
+                filename, self.s3, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
+            info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             timestamp=timestamp)
 
 
 class WaybackArchiver(Archiver):
@@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
         status_r = requests.get(
@@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
 
             title = parsed.find_all('title')[
                 0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
@@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = 'tiktok_' + str(info.id) + '.mp4'
+            cdn_url = get_cdn_url(key)
             filename = 'tmp/' + key
 
             if check_if_exists:
@@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
                 except ClientError:
                     pass
 
-            if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        do_s3_upload(self.s3, f, key)
+            media = tiktok_downloader.snaptik(url).get_media()
 
-                    cdn_url = get_cdn_url(key)
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
                 else:
-                    status = 'could not download media'
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)
+
+            if status != 'already archived':
+                with open(filename, 'rb') as f:
+                    do_s3_upload(self.s3, f, key)
 
             try:
                 key_thumb, thumb_index = get_thumbnails(
diff --git a/auto_archive.py b/auto_archive.py
index ef4f89c..fe2ccfd 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -10,6 +10,7 @@ import math
 import threading
 from loguru import logger
 import archivers
+import requests
 
 load_dotenv()
 
@@ -43,7 +44,7 @@ def index_to_col(index):
         return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -155,7 +156,6 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
-
         active_archivers = [
             archivers.TelegramArchiver(s3_client),
             archivers.TiktokArchiver(s3_client),
@@ -163,7 +163,6 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]
 
-
         # loop through rows in worksheet
         for i in range(2, len(values)+1):
             v = values[i-1]
@@ -174,26 +173,35 @@ def process_sheet(sheet):
 
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i),
+                               'Archive in progress')
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        url = v[url_index]
+                        # expand short URL links
+                        if 'https://t.co/' in url:
+                            r = requests.get(url)
+                            url = r.url
+
+                        result = archiver.download(url, check_if_exists=True)
                         if result:
                             logger.info(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
-
+                    else:
+                        wks.update(columns['status'] +
+                                   str(i), 'failed: no archiver')
 
                         # except:
-                            # if any unexpected errors occured, log these into the Google Sheet
-                            # t, value, traceback = sys.exc_info()
+                        # if any unexpected errors occured, log these into the Google Sheet
+                        # t, value, traceback = sys.exc_info()
 
-                            # update_sheet(wks, i, str(
-                            #     value), {}, columns, v)
+                        # update_sheet(wks, i, str(
+                        #     value), {}, columns, v)
 
 
 def main():

From e4603a942305bcd9ad772d14e21af7cb896ba759 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 22 Feb 2022 16:03:35 +0100
Subject: [PATCH 05/16] refactoring storage and bringing changes from origin

---
 __init__.py                     |  1 +
 archivers/base_archiver.py      | 55 +++++++++----------------
 archivers/telegram_archiver.py  | 27 ++++--------
 archivers/tiktok_archiver.py    | 39 +++++++-----------
 archivers/wayback_archiver.py   | 27 ++++++------
 archivers/youtubedl_archiver.py | 41 +++++++++---------
 auto_archive.py                 | 73 ++++++++++++++++++++-------------
 storages/__init__.py            |  3 ++
 storages/base_storage.py        | 19 +++++++++
 storages/s3_storage.py          | 49 ++++++++++++++++++++++
 10 files changed, 197 insertions(+), 137 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 storages/__init__.py
 create mode 100644 storages/base_storage.py
 create mode 100644 storages/s3_storage.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..b85e02a
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from storages import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 3f9f4ac..b13a77f 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,17 +1,10 @@
 import os
 import ffmpeg
-from dataclasses import dataclass
 import datetime
-from loguru import logger
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
 
-# TODO There should be a better way of generating keys, that adds the following info:
-#           - name of sheet that it is being archived from
-#             (this means we might archive the same media twice on different sheets, but that's OK I think)
-#           - name of archiver/platform that the video comes from
-#       This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-#      cleaned up? Difficult is we don't know the filename until the archivers start working.
+from storages import Storage
 
 
 @dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
     timestamp: datetime.datetime = None
 
 
-class Archiver:
+class Archiver(ABC):
     name = "default"
 
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        self.storage = storage
 
     def __str__(self):
         return self.__class__.__name__
 
-    def download(self, url, check_if_exists=False):
-        logger.error("method 'download' not implemented")
-
-    def get_cdn_url(self, key):
-        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-    def do_s3_upload(self, f, key):
-        self.s3.upload_fileobj(f, Bucket=os.getenv(
-            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass
 
     def get_key(self, filename):
-        print(f"key base implementation: {self.name}")
-        # TODO: refactor to be more manageable
-        key = filename.split('/')[1]
-        if 'unknown_video' in key:
-            key = key.replace('unknown_video', 'jpg')
-        return key
+        """
+        returns a key in the format "[archiverName]_[filename]" includes extension
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'
 
     def get_thumbnails(self, filename, duration=None):
         if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
                 thumbnail_filename = filename.split('.')[0] + '/' + fname
                 key = filename.split('/')[1].split('.')[0] + '/' + fname
 
-                cdn_url = self.get_cdn_url(key)
+                cdn_url = self.storage.get_cdn_url(key)
 
-                with open(thumbnail_filename, 'rb') as f:
-                    self.do_s3_upload(f, key)
+                self.storage.upload(thumbnail_filename, key)
 
                 cdn_urls.append(cdn_url)
                 os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
 
         thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
 
-        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
 
-        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
 
         return (key_thumb, thumb_index_cdn_url)
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index d9168e4..16c6ccf 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -1,15 +1,13 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from botocore.errorfactory import ClientError
-from .base_archiver import Archiver, ArchiveResult
 
-# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TelegramArchiver(Archiver):
     name = "telegram"
-    
+
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
 
         video_url = video.get('src')
         key = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(key)
+
         filename = 'tmp/' + key
 
-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+            cdn_url = self.storage.get_cdn_url(key)
 
         v = requests.get(video_url, headers=headers)
 
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
             f.write(v.content)
 
         if status != 'already archived':
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 1e3bcaf..e61fec9 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -1,15 +1,13 @@
 import os, traceback
-from botocore.errorfactory import ClientError
 import tiktok_downloader
 from loguru import logger
-from .base_archiver import Archiver, ArchiveResult
 
-# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TiktokArchiver(Archiver):
     name = "tiktok"
-    
+
     def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
@@ -18,35 +16,28 @@ class TiktokArchiver(Archiver):
 
         try:
             info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
+            key = self.get_key(f'{info.id}.mp4')
+            cdn_url = self.get_cdn_url(key)
             filename = 'tmp/' + key
 
-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
 
-                    # file exists
-                    cdn_url = self.get_cdn_url(key)
+            media = tiktok_downloader.snaptik(url).get_media()
 
-                    status = 'already archived'
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
 
-                except ClientError:
-                    pass
+            media[0].download(filename)
 
             if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        self.do_s3_upload(f, key)
-
-                    cdn_url = self.get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+                self.storage.upload(filename, key)
 
             try:
-                key_thumb, thumb_index = self.get_thumbnails(
-                    filename, duration=info.duration)
+                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
             except:
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index a021324..53b356f 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -1,14 +1,15 @@
 import time, requests, os
 from bs4 import BeautifulSoup
 
+from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
 
 
 class WaybackArchiver(Archiver):
     name = "wayback"
-    
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
 
         retries = 0
 
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
         try:
             r = requests.get(archive_url)
 
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
+            parsed = BeautifulSoup(r.content, 'html.parser')
 
-            title = parsed.find_all('title')[
-                0].text
+            title = parsed.find_all('title')[0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
         self.seen_urls[url] = result
         return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 8249cfa..88f7970 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -3,12 +3,13 @@ import os
 import datetime
 import youtube_dl
 from loguru import logger
-from botocore.errorfactory import ClientError
+
 from .base_archiver import Archiver, ArchiveResult
 
+
 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"
-    
+
     def download(self, url, check_if_exists=False):
         ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
         if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
         if check_if_exists:
             if 'entries' in info:
                 if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False
 
                 filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
 
             key = self.get_key(filename)
 
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
+            if self.storage.exists(key):
                 status = 'already archived'
-
-            except ClientError:
-                pass
+                cdn_url = self.storage.get_cdn_url(key)
 
         # sometimes this results in a different filename, so do this again
         info = ydl.extract_info(url, download=True)
 
+        # TODO: add support for multiple videos
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
             filename = filename.split('.')[0] + '.mkv'
 
         if status != 'already archived':
-            key = self. get_key(filename)
-            cdn_url = self.get_cdn_url(key)
+            key = self.get_key(filename)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # get duration
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp)
diff --git a/auto_archive.py b/auto_archive.py
index c478463..36bbadb 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,12 +2,13 @@ import os
 import datetime
 import argparse
 import math
+import requests
 import gspread
-import boto3
 from loguru import logger
 from dotenv import load_dotenv
 
 import archivers
+from storages import S3Storage, S3Config
 
 load_dotenv()
 
@@ -41,7 +42,7 @@ def index_to_col(index):
         return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -103,19 +104,24 @@ def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
 def process_sheet(sheet):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
-    n_worksheets = len(sh.worksheets())
 
-    s3_client = boto3.client('s3',
-                             region_name=os.getenv('DO_SPACES_REGION'),
-                             endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                 os.getenv('DO_SPACES_REGION')),
-                             aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-                             aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+    s3_config = S3Config(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )
+
+    # s3_client = boto3.client('s3',
+    #                          region_name=os.getenv('DO_SPACES_REGION'),
+    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
+    #                              os.getenv('DO_SPACES_REGION')),
+    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
 
     # loop through worksheets to check
-    for ii in range(n_worksheets):
-        logger.info("Opening worksheet " + str(ii))
-        wks = sh.get_worksheet(ii)
+    for ii, wks in enumerate(sh.worksheets()):
+        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         values = wks.get_all_values()
 
         headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@ def process_sheet(sheet):
                 'source url')) if 'source url' in headers else None
 
         if columns['url'] is None:
-            logger.warning("No 'Media URL' column found, skipping")
+            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
         url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
+        # archives will be in a folder 'doc_name/worksheet_name'
+        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_client = S3Storage(s3_config)
 
         # order matters, first to succeed excludes remaining
         active_archivers = [
@@ -162,37 +171,43 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]
 
-
         # loop through rows in worksheet
-        for i in range(2, len(values)+1):
-            v = values[i-1]
+        for i in range(2, len(values) + 1):
+            v = values[i - 1]
+            url = v[url_index]
 
-            if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(
-                    columns['status'] + str(i)).value
+            if url != "" and v[col_to_index(columns['status'])] == "":
+                latest_val = wks.acell(columns['status'] + str(i)).value
 
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i), 'Archive in progress')
+
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        result = archiver.download(url, check_if_exists=True)
+
                         if result:
-                            logger.info(f"{archiver} succeeded on row {i}")
+                            logger.success(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] + str(i), 'failed: no archiver')
 
+                    # except:
+                    # if any unexpected errors occured, log these into the Google Sheet
+                    # t, value, traceback = sys.exc_info()
 
-                        # except:
-                            # if any unexpected errors occured, log these into the Google Sheet
-                            # t, value, traceback = sys.exc_info()
-
-                            # update_sheet(wks, i, str(
-                            #     value), {}, columns, v)
+                    # update_sheet(wks, i, str(
+                    #     value), {}, columns, v)
 
 
 def main():
diff --git a/storages/__init__.py b/storages/__init__.py
new file mode 100644
index 0000000..3054d36
--- /dev/null
+++ b/storages/__init__.py
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *
\ No newline at end of file
diff --git a/storages/base_storage.py b/storages/base_storage.py
new file mode 100644
index 0000000..050a8eb
--- /dev/null
+++ b/storages/base_storage.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    @abstractmethod
+    def __init__(self, config): pass
+
+    @abstractmethod
+    def get_cdn_url(self, path): pass
+
+    @abstractmethod
+    def exists(self, path): pass
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs): pass
+
+    def upload(self, filename: str, key: str, **kwargs):
+        with open(filename, 'rb') as f:
+            self.uploadf(f, key, **kwargs)
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
new file mode 100644
index 0000000..188db7e
--- /dev/null
+++ b/storages/s3_storage.py
@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+    bucket: str
+    region: str
+    key: str
+    secret: str
+    folder: str = ""
+
+
+class S3Storage(Storage):
+
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+        self.folder = config.folder
+
+        if len(self.folder) and self.folder[-1] != '/':
+            self.folder += '/'
+
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        return self.folder + key
+
+    def get_cdn_url(self, key):
+        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+
+    def exists(self, key):
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            return True
+        except ClientError:
+            return False
+
+    def uploadf(self, file, key, **kwargs):
+        extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

From 2d145802b54d27012b5afedca4bef60b5a3038b5 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 09:54:03 +0100
Subject: [PATCH 06/16] extracted worksheet operations

---
 archivers/tiktok_archiver.py |   2 +-
 auto_archive.py              | 196 +++++++++--------------------------
 gworksheet.py                |  97 +++++++++++++++++
 3 files changed, 145 insertions(+), 150 deletions(-)
 create mode 100644 gworksheet.py

diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index e61fec9..b54f956 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -17,7 +17,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = self.get_key(f'{info.id}.mp4')
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
             filename = 'tmp/' + key
 
             if check_if_exists and self.storage.exists(key):
diff --git a/auto_archive.py b/auto_archive.py
index 36bbadb..d636cbd 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,7 +1,6 @@
 import os
 import datetime
 import argparse
-import math
 import requests
 import gspread
 from loguru import logger
@@ -9,96 +8,34 @@ from dotenv import load_dotenv
 
 import archivers
 from storages import S3Storage, S3Config
+from gworksheet import GWorksheet
 
 load_dotenv()
 
 
-def col_to_index(col):
-    col = list(col)
-    ndigits = len(col)
-    alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    v = 0
-    i = ndigits - 1
-
-    for digit in col:
-        index = alphabet.find(digit)
-        v += (26 ** i) * index
-        i -= 1
-
-    return v - 1
-
-
-def index_to_col(index):
-    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
-    if index > 25:
-        t = index
-        dig = 0
-        while t > 25:
-            t = math.floor(t / 26)
-            dig += 1
-        return alphabet[t - 1] + index_to_col(index - t * int(math.pow(26, dig)))
-    else:
-        return alphabet[index]
-
-
-def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
+def update_sheet(gw, row, result: archivers.ArchiveResult):
     update = []
 
-    if columns['status'] is not None:
-        update += [{
-            'range': columns['status'] + str(row),
-            'values': [[result.status]]
-        }]
+    def batch_if_valid(col, val, final_value=None):
+        final_value = final_value or val
+        if val and gw.col_exists(col) and gw.cell(row, col) == '':
+            update.append((row, col, final_value))
 
-    if result.cdn_url and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '':
-        update += [{
-            'range': columns['archive'] + str(row),
-            'values': [[result.cdn_url]]
-        }]
+    update.append((row, 'status', result.status))
 
-    if columns['date'] is not None and v[col_to_index(columns['date'])] == '':
-        update += [{
-            'range': columns['date'] + str(row),
-            'values': [[datetime.datetime.now().isoformat()]]
-        }]
+    batch_if_valid('archive', result.cdn_url)
+    batch_if_valid('archive', True, datetime.datetime.now().isoformat())
+    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+    batch_if_valid('thumbnail_index', result.thumbnail_index)
+    batch_if_valid('title', result.title)
+    batch_if_valid('duration', result.duration, str(result.duration))
 
-    if result.thumbnail and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '':
-        update += [{
-            'range': columns['thumbnail'] + str(row),
-            'values': [['=IMAGE("' + result.thumbnail + '")']]
-        }]
+    if result.timestamp and type(result.timestamp) != str:
+        result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
+    batch_if_valid('timestamp', result.timestamp)
 
-    if result.thumbnail_index and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '':
-        update += [{
-            'range': columns['thumbnail_index'] + str(row),
-            'values': [[result.thumbnail_index]]
-        }]
-
-    if result.timestamp and columns['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '':
-        update += [{
-            'range': columns['timestamp'] + str(row),
-            'values': [[result.timestamp]] if type(result.timestamp) == str else [[datetime.datetime.fromtimestamp(result.timestamp).isoformat()]]
-        }]
-
-    if result.title and columns['title'] is not None and v[col_to_index(columns['title'])] == '':
-        update += [{
-            'range': columns['title'] + str(row),
-            'values': [[result.title]]
-        }]
-
-    if result.duration and columns['duration'] is not None and v[col_to_index(columns['duration'])] == '':
-        update += [{
-            'range': columns['duration'] + str(row),
-            'values': [[str(result.duration)]]
-        }]
-
-    wks.batch_update(update, value_input_option='USER_ENTERED')
-
-
-# def record_stream(url, s3_client, wks, i, columns, v):
-#     video_data, status = download_vid(url, s3_client)
-#     update_sheet(wks, i, status, video_data, columns, v)
+    gw.update_batch(update)
+    
 
 
 def process_sheet(sheet):
@@ -112,53 +49,19 @@ def process_sheet(sheet):
         secret=os.getenv('DO_SPACES_SECRET')
     )
 
-    # s3_client = boto3.client('s3',
-    #                          region_name=os.getenv('DO_SPACES_REGION'),
-    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
-    #                              os.getenv('DO_SPACES_REGION')),
-    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
-
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
         logger.info(f'Opening worksheet {ii}: "{wks.title}"')
-        values = wks.get_all_values()
+        gw = GWorksheet(wks)
 
-        headers = [v.lower() for v in values[0]]
-        columns = {}
-
-        columns['url'] = index_to_col(headers.index(
-            'media url')) if 'media url' in headers else index_to_col(headers.index(
-                'source url')) if 'source url' in headers else None
-
-        if columns['url'] is None:
+        if not gw.col_exists("url"):
             logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
-        url_index = col_to_index(columns['url'])
-
-        columns['archive'] = index_to_col(headers.index(
-            'archive location')) if 'archive location' in headers else None
-        columns['date'] = index_to_col(headers.index(
-            'archive date')) if 'archive date' in headers else None
-        columns['status'] = index_to_col(headers.index(
-            'archive status')) if 'archive status' in headers else None
-
-        if columns['status'] is None:
+        if not gw.col_exists("status"):
             logger.warning("No 'Archive status' column found, skipping")
             continue
 
-        columns['thumbnail'] = index_to_col(headers.index(
-            'thumbnail')) if 'thumbnail' in headers else None
-        columns['thumbnail_index'] = index_to_col(headers.index(
-            'thumbnail index')) if 'thumbnail index' in headers else None
-        columns['timestamp'] = index_to_col(headers.index(
-            'upload timestamp')) if 'upload timestamp' in headers else None
-        columns['title'] = index_to_col(headers.index(
-            'upload title')) if 'upload title' in headers else None
-        columns['duration'] = index_to_col(headers.index(
-            'duration')) if 'duration' in headers else None
-
         # archives will be in a folder 'doc_name/worksheet_name'
         s3_config.folder = f'{sheet}/{wks.title}/'
         s3_client = S3Storage(s3_config)
@@ -172,47 +75,42 @@ def process_sheet(sheet):
         ]
 
         # loop through rows in worksheet
-        for i in range(2, len(values) + 1):
-            v = values[i - 1]
-            url = v[url_index]
+        for i in range(2, gw.count_rows() + 1):
+            row = gw.get_row(i)
+            url = gw.cell(row, 'url')
+            status = gw.cell(row, 'status')
+            if url != '' and status in ['', None]:
+                gw.update(i, 'status', 'Archive in progress')
 
-            if url != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(columns['status'] + str(i)).value
+                # expand short URL links
+                if 'https://t.co/' in url:
+                    r = requests.get(url)
+                    url = r.url
 
-                # check so we don't step on each others' toes
-                if latest_val == '' or latest_val is None:
-                    wks.update(columns['status'] + str(i), 'Archive in progress')
-
-                    # expand short URL links
-                    if 'https://t.co/' in url:
-                        r = requests.get(url)
-                        url = r.url
-
-                    for archiver in active_archivers:
-                        logger.debug(f"Trying {archiver} on row {i}")
-
-                        result = archiver.download(url, check_if_exists=True)
-
-                        if result:
-                            logger.success(f"{archiver} succeeded on row {i}")
-                            break
+                for archiver in active_archivers:
+                    logger.debug(f'Trying {archiver} on row {i}')
+                    result = archiver.download(url, check_if_exists=True)
 
                     if result:
-                        update_sheet(wks, i, result, columns, v)
-                    else:
-                        wks.update(columns['status'] + str(i), 'failed: no archiver')
+                        logger.success(f'{archiver} succeeded on row {i}')
+                        break
 
-                    # except:
-                    # if any unexpected errors occured, log these into the Google Sheet
-                    # t, value, traceback = sys.exc_info()
+                if result:
+                    update_sheet(gw, i, result)
+                else:
+                    gw.update(i, 'status', 'failed: no archiver')
 
-                    # update_sheet(wks, i, str(
-                    #     value), {}, columns, v)
+        #             # except:
+        #             # if any unexpected errors occured, log these into the Google Sheet
+        #             # t, value, traceback = sys.exc_info()
+
+        #             # update_sheet(wks, i, str(
+        #             #     value), {}, columns, v)
 
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Automatically archive social media videos from a Google Sheet")
+        description="Automatically archive social media videos from a Google Sheets document")
     parser.add_argument("--sheet", action="store", dest="sheet")
     args = parser.parse_args()
 
diff --git a/gworksheet.py b/gworksheet.py
new file mode 100644
index 0000000..721bb01
--- /dev/null
+++ b/gworksheet.py
@@ -0,0 +1,97 @@
+from gspread import utils
+
+
+class GWorksheet:
+    COLUMN_NAMES = {
+        'url': 'media url',
+        'archive': 'archive location',
+        'date': 'archive date',
+        'status': 'archive status',
+        'thumbnail': 'thumbnail',
+        'thumbnail_index': 'thumbnail index',
+        'timestamp': 'upload timestamp',
+        'title': 'upload title',
+        'duration': 'duration'
+    }
+
+    def __init__(self, worksheet, columns=COLUMN_NAMES):
+        self.wks = worksheet
+        self.headers = [v.lower() for v in self.wks.row_values(1)]
+        self.columns = columns
+
+    def worksheet(self): return self.wks
+
+    def _check_col_exists(self, col: str):
+        if col not in self.columns:
+            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
+
+    def col_exists(self, col: str):
+        self._check_col_exists(col)
+        return self.columns[col] in self.headers
+
+    def col_index(self, col: str):
+        self._check_col_exists(col)
+        return self.headers.index(self.columns[col])
+
+    def count_rows(self):
+        return len(self.wks.get_values())
+
+    def get_row(self, row: int):
+        # row is 1-based
+        return self.wks.row_values(row)
+
+    def cell(self, row, col: str):
+        # row can be index (1-based) or list of values
+        if type(row) == int:
+            row = self.get_row(row)
+
+        col_index = self.col_index(col)
+        if col_index >= len(row):
+            return ''
+        return row[col_index]
+
+    def update(self, row: int, col: str, val):
+        # row is 1-based
+        col_index = self.col_index(col) + 1
+        self.wks.update_cell(row, col_index, val)
+
+    def update_batch(self, updates):
+        updates = [
+            {
+                'range': self.to_a1(row, self.col_index(col) + 1),
+                'values': [[val]]
+            }
+            for row, col, val in updates
+        ]
+        self.wks.batch_update(updates, value_input_option='USER_ENTERED')
+
+    def to_a1(self, row: int, col: int):
+        # row, col are 1-based
+        return utils.rowcol_to_a1(row, col)
+
+    # def index_to_col(self, index):
+    #     alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+    #     if index > 25:
+    #         t = index
+    #         dig = 0
+    #         while t > 25:
+    #             t = math.floor(t / 26)
+    #             dig += 1
+    #         return alphabet[t - 1] + self.index_to_col(index - t * int(math.pow(26, dig)))
+    #     else:
+    #         return alphabet[index]
+
+    # def col_to_index(self, col):
+    #     col = list(col)
+    #     ndigits = len(col)
+    #     alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    #     v = 0
+    #     i = ndigits - 1
+
+    #     for digit in col:
+    #         index = alphabet.find(digit)
+    #         v += (26 ** i) * index
+    #         i -= 1
+
+    #     return v - 1

From 374852e740f38ba9609862b1c2f315e9ac9b9b6c Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 09:57:04 +0100
Subject: [PATCH 07/16] cleanup

---
 auto_archive.py | 13 ++++++-------
 gworksheet.py   | 27 ---------------------------
 2 files changed, 6 insertions(+), 34 deletions(-)

diff --git a/auto_archive.py b/auto_archive.py
index d636cbd..cbe0744 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -35,7 +35,6 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
     batch_if_valid('timestamp', result.timestamp)
 
     gw.update_batch(update)
-    
 
 
 def process_sheet(sheet):
@@ -54,11 +53,11 @@ def process_sheet(sheet):
         logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         gw = GWorksheet(wks)
 
-        if not gw.col_exists("url"):
+        if not gw.col_exists('url'):
             logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
-        if not gw.col_exists("status"):
+        if not gw.col_exists('status'):
             logger.warning("No 'Archive status' column found, skipping")
             continue
 
@@ -110,14 +109,14 @@ def process_sheet(sheet):
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Automatically archive social media videos from a Google Sheets document")
-    parser.add_argument("--sheet", action="store", dest="sheet")
+        description='Automatically archive social media videos from a Google Sheets document')
+    parser.add_argument('--sheet', action='store', dest='sheet')
     args = parser.parse_args()
 
-    logger.info("Opening document " + args.sheet)
+    logger.info(f'Opening document {args.sheet}')
 
     process_sheet(args.sheet)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
diff --git a/gworksheet.py b/gworksheet.py
index 721bb01..496ddcc 100644
--- a/gworksheet.py
+++ b/gworksheet.py
@@ -68,30 +68,3 @@ class GWorksheet:
     def to_a1(self, row: int, col: int):
         # row, col are 1-based
         return utils.rowcol_to_a1(row, col)
-
-    # def index_to_col(self, index):
-    #     alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
-    #     if index > 25:
-    #         t = index
-    #         dig = 0
-    #         while t > 25:
-    #             t = math.floor(t / 26)
-    #             dig += 1
-    #         return alphabet[t - 1] + self.index_to_col(index - t * int(math.pow(26, dig)))
-    #     else:
-    #         return alphabet[index]
-
-    # def col_to_index(self, col):
-    #     col = list(col)
-    #     ndigits = len(col)
-    #     alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    #     v = 0
-    #     i = ndigits - 1
-
-    #     for digit in col:
-    #         index = alphabet.find(digit)
-    #         v += (26 ** i) * index
-    #         i -= 1
-
-    #     return v - 1

From 644aa0811c55219027e9c5eb98ef18e6c9e82057 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 09:57:44 +0100
Subject: [PATCH 08/16] todo

---
 auto_archive.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/auto_archive.py b/auto_archive.py
index cbe0744..7e624d0 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -88,6 +88,7 @@ def process_sheet(sheet):
 
                 for archiver in active_archivers:
                     logger.debug(f'Trying {archiver} on row {i}')
+                    # TODO: add support for multiple videos/images
                     result = archiver.download(url, check_if_exists=True)
 
                     if result:

From 9550cd509e684159823cf9ed4a7ba8733c912daa Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 13:57:11 +0100
Subject: [PATCH 09/16] making code more resilient to exceptions

---
 archivers/telegram_archiver.py |  4 +--
 auto_archive.py                | 64 +++++++++++++++++++---------------
 gworksheet.py                  | 37 +++++++++++---------
 3 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 16c6ccf..5593acd 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -54,8 +54,8 @@ class TelegramArchiver(Archiver):
         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
         if ':' in duration:
-            duration = float(duration.split(
-                ':')[0]) * 60 + float(duration.split(':')[1])
+            duration = (float(duration.split(':')[0]) * 60  # "M:S" text -> seconds
+                        + float(duration.split(':')[1]))  # parentheses keep this term in the sum
         else:
             duration = float(duration)
 
diff --git a/auto_archive.py b/auto_archive.py
index 7e624d0..cb70c58 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -14,17 +14,18 @@ load_dotenv()
 
 
 def update_sheet(gw, row, result: archivers.ArchiveResult):
-    update = []
+    cell_updates = []
+    row_values = gw.get_row(row)
 
     def batch_if_valid(col, val, final_value=None):
         final_value = final_value or val
-        if val and gw.col_exists(col) and gw.cell(row, col) == '':
-            update.append((row, col, final_value))
+        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+            cell_updates.append((row, col, final_value))
 
-    update.append((row, 'status', result.status))
+    cell_updates.append((row, 'status', result.status))
 
     batch_if_valid('archive', result.cdn_url)
-    batch_if_valid('archive', True, datetime.datetime.now().isoformat())
+    batch_if_valid('date', True, datetime.datetime.now().isoformat())
     batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
     batch_if_valid('thumbnail_index', result.thumbnail_index)
     batch_if_valid('title', result.title)
@@ -34,7 +35,18 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
         result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
     batch_if_valid('timestamp', result.timestamp)
 
-    gw.update_batch(update)
+    gw.batch_set_cell(cell_updates)
+
+
+def expand_url(url):
+    """Expand t.co short links to the final URL; return url unchanged if not t.co or on failure."""
+    if 'https://t.co/' in url:
+        try:
+            # requests follows redirects on GET, so .url is the final location
+            url = requests.get(url).url
+        except Exception as e:  # best-effort: keep the short URL, but let Ctrl-C/SystemExit through
+            logger.error(f'Failed to expand url {url}: {e}')
+    return url
 
 
 def process_sheet(sheet):
@@ -74,38 +86,34 @@ def process_sheet(sheet):
         ]
 
         # loop through rows in worksheet
-        for i in range(2, gw.count_rows() + 1):
-            row = gw.get_row(i)
-            url = gw.cell(row, 'url')
-            status = gw.cell(row, 'status')
+        for row in range(2, gw.count_rows() + 1):
+            url = gw.get_cell(row, 'url')
+            status = gw.get_cell(row, 'status')
             if url != '' and status in ['', None]:
-                gw.update(i, 'status', 'Archive in progress')
+                gw.set_cell(row, 'status', 'Archive in progress')
 
-                # expand short URL links
-                if 'https://t.co/' in url:
-                    r = requests.get(url)
-                    url = r.url
+                url = expand_url(url)
 
                 for archiver in active_archivers:
-                    logger.debug(f'Trying {archiver} on row {i}')
+                    logger.debug(f'Trying {archiver} on row {row}')
+
                     # TODO: add support for multiple videos/images
-                    result = archiver.download(url, check_if_exists=True)
+                    try:
+                        result = archiver.download(url, check_if_exists=True)
+                    except Exception as e:
+                        result = False
+                        logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
 
                     if result:
-                        logger.success(f'{archiver} succeeded on row {i}')
-                        break
+                        if result.status in ['success', 'already archived']:
+                            logger.success(f'{archiver} succeeded on row {row}')
+                            break
+                        logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
 
                 if result:
-                    update_sheet(gw, i, result)
+                    update_sheet(gw, row, result)
                 else:
-                    gw.update(i, 'status', 'failed: no archiver')
-
-        #             # except:
-        #             # if any unexpected errors occured, log these into the Google Sheet
-        #             # t, value, traceback = sys.exc_info()
-
-        #             # update_sheet(wks, i, str(
-        #             #     value), {}, columns, v)
+                    gw.set_cell(row, 'status', 'failed: no archiver')
 
 
 def main():
diff --git a/gworksheet.py b/gworksheet.py
index 496ddcc..88de9a4 100644
--- a/gworksheet.py
+++ b/gworksheet.py
@@ -19,20 +19,18 @@ class GWorksheet:
         self.headers = [v.lower() for v in self.wks.row_values(1)]
         self.columns = columns
 
-    def worksheet(self): return self.wks
-
     def _check_col_exists(self, col: str):
         if col not in self.columns:
             raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
 
+    def _col_index(self, col: str):
+        self._check_col_exists(col)
+        return self.headers.index(self.columns[col])
+
     def col_exists(self, col: str):
         self._check_col_exists(col)
         return self.columns[col] in self.headers
 
-    def col_index(self, col: str):
-        self._check_col_exists(col)
-        return self.headers.index(self.columns[col])
-
     def count_rows(self):
         return len(self.wks.get_values())
 
@@ -40,30 +38,37 @@ class GWorksheet:
         # row is 1-based
         return self.wks.row_values(row)
 
-    def cell(self, row, col: str):
-        # row can be index (1-based) or list of values
+    def get_cell(self, row, col: str):
+        """
+        returns the cell value from (row, col), 
+        where row can be an index (1-based) OR list of values
+        as received from self.get_row(row)
+        """
         if type(row) == int:
             row = self.get_row(row)
 
-        col_index = self.col_index(col)
+        col_index = self._col_index(col)
         if col_index >= len(row):
             return ''
         return row[col_index]
 
-    def update(self, row: int, col: str, val):
+    def set_cell(self, row: int, col: str, val):
         # row is 1-based
-        col_index = self.col_index(col) + 1
+        col_index = self._col_index(col) + 1
         self.wks.update_cell(row, col_index, val)
 
-    def update_batch(self, updates):
-        updates = [
+    def batch_set_cell(self, cell_updates):
+        """
+        receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
+        """
+        cell_updates = [
             {
-                'range': self.to_a1(row, self.col_index(col) + 1),
+                'range': self.to_a1(row, self._col_index(col) + 1),
                 'values': [[val]]
             }
-            for row, col, val in updates
+            for row, col, val in cell_updates
         ]
-        self.wks.batch_update(updates, value_input_option='USER_ENTERED')
+        self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
 
     def to_a1(self, row: int, col: int):
         # row, col are 1-based

From 9a264a7dfeffc0077b7451f578fd46407efdedbd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 16:07:58 +0100
Subject: [PATCH 10/16] cleanup and docs

---
 README.md                       | 20 ++++++++++++++++++++
 archivers/base_archiver.py      | 26 ++++++++++++++++----------
 archivers/telegram_archiver.py  | 10 +++++-----
 archivers/tiktok_archiver.py    |  5 +++--
 archivers/youtubedl_archiver.py | 13 +++++++------
 gworksheet.py                   |  8 ++++----
 6 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index cec6e9a..3d7f751 100644
--- a/README.md
+++ b/README.md
@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
 
 ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
 
+# Code structure
+Code is split into functional concepts:
+1. [Archivers](archivers/) - receive a URL that they try to archive
+2. [Storages](storages/) - they deal with where the archived files go
+3. utilities
+   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
+
+### Current Archivers
+```mermaid
+graph TD
+    A(Archiver) -->|parent of| B(TelegramArchiver)
+    A -->|parent of| C(TikTokArchiver)
+    A -->|parent of| D(YoutubeDLArchiver)
+    A -->|parent of| E(WaybackArchiver)
+```
+### Current Storages
+```mermaid
+graph TD
+    A(BaseStorage) -->|parent of| B(S3Storage)
+```
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index b13a77f..6257aba 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -3,6 +3,7 @@ import ffmpeg
 import datetime
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
+from urllib.parse import urlparse
 
 from storages import Storage
 
@@ -30,6 +31,9 @@ class Archiver(ABC):
     @abstractmethod
     def download(self, url, check_if_exists=False): pass
 
+    def get_netloc(self, url):
+        return urlparse(url).netloc
+
     def get_key(self, filename):
         """
         returns a key in the format "[archiverName]_[filename]" includes extension
@@ -40,9 +44,12 @@ class Archiver(ABC):
             _id = _id.replace('unknown_video', 'jpg')
         return f'{self.name}_{_id}{extension}'
 
-    def get_thumbnails(self, filename, duration=None):
-        if not os.path.exists(filename.split('.')[0]):
-            os.mkdir(filename.split('.')[0])
+    def get_thumbnails(self, filename, key, duration=None):
+        thumbnails_folder = filename.split('.')[0] + '/'
+        key_folder = key.split('.')[0] + '/'
+
+        if not os.path.exists(thumbnails_folder):
+            os.mkdir(thumbnails_folder)
 
         fps = 0.5
         if duration is not None:
@@ -57,15 +64,14 @@ class Archiver(ABC):
 
         stream = ffmpeg.input(filename)
         stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-        stream.output(filename.split('.')[0] + '/out%d.jpg').run()
+        stream.output(thumbnails_folder + 'out%d.jpg').run()
 
-        thumbnails = os.listdir(filename.split('.')[0] + '/')
+        thumbnails = os.listdir(thumbnails_folder)
         cdn_urls = []
-
         for fname in thumbnails:
             if fname[-3:] == 'jpg':
-                thumbnail_filename = filename.split('.')[0] + '/' + fname
-                key = filename.split('/')[1].split('.')[0] + '/' + fname
+                thumbnail_filename = thumbnails_folder + fname
+                key = key_folder + fname
 
                 cdn_url = self.storage.get_cdn_url(key)
 
@@ -86,12 +92,12 @@ class Archiver(ABC):
             index_page += f'<img src="{t}" />'
 
         index_page += f"</body></html>"
-        index_fname = filename.split('.')[0] + '/index.html'
+        index_fname = thumbnails_folder + 'index.html'
 
         with open(index_fname, 'w') as f:
             f.write(index_page)
 
-        thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
+        thumb_index = key_folder + 'index.html'
 
         self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
 
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 5593acd..5a9b013 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):
 
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
-        if 'http://t.me/' not in url and 'https://t.me/' not in url:
+        if 't.me' != self.get_netloc(url):
             return False
 
         headers = {
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):
 
         original_url = url
 
-        # TODO: check if we can do this more resilient to user-input
+        # TODO: check if we can do this more resilient to variable URLs
         if url[-8:] != "?embed=1":
             url += "?embed=1"
 
@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
             return False  # could not find video
 
         video_url = video.get('src')
-        key = video_url.split('/')[-1].split('?')[0]
-        key = self.get_key(key)
+        video_id = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(video_id)
 
         filename = 'tmp/' + key
 
@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
             duration = float(duration)
 
         # process thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
         os.remove(filename)
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index b54f956..62aa415 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
                 self.storage.upload(filename, key)
 
             try:
-                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
-            except:
+                key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
+            except Exception as e:
+                logger.error(e)
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'
 
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 88f7970..ec11061 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult
 
 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"
+    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
 
     def download(self, url, check_if_exists=False):
-        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
+        netloc = self.get_netloc(url)
+        if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'):  # host was misspelled 'wwww.'
             logger.info('Using Facebook cookie')
             youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
 
-        ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
         cdn_url = None
         status = 'success'
 
@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
             # no video here
             return False
 
-        if 'is_live' in info and info['is_live']:
+        if info.get('is_live', False):
             logger.warning("Live streaming media, not archiving now")
             return ArchiveResult(status="Streaming media")
 
@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
             self.storage.upload(filename, key)
 
         # get duration
-        duration = info['duration'] if 'duration' in info else None
+        duration = info.get('duration')
 
         # get thumbnails
         try:
-            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
         except:
             key_thumb = ''
             thumb_index = 'Could not generate thumbnails'
diff --git a/gworksheet.py b/gworksheet.py
index 88de9a4..4349e2a 100644
--- a/gworksheet.py
+++ b/gworksheet.py
@@ -63,13 +63,13 @@ class GWorksheet:
         """
         cell_updates = [
             {
-                'range': self.to_a1(row, self._col_index(col) + 1),
+                'range': self.to_a1(row, col),
                 'values': [[val]]
             }
             for row, col, val in cell_updates
         ]
         self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
 
-    def to_a1(self, row: int, col: int):
-        # row, col are 1-based
-        return utils.rowcol_to_a1(row, col)
+    def to_a1(self, row: int, col: str):
+        # row is 1-based
+        return utils.rowcol_to_a1(row, self._col_index(col) + 1)

From 2601313249fd46581150b05692f908df5c709c07 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 16:13:09 +0100
Subject: [PATCH 11/16] removed archivers.py

---
 archivers.py | 412 ---------------------------------------------------
 1 file changed, 412 deletions(-)
 delete mode 100644 archivers.py

diff --git a/archivers.py b/archivers.py
deleted file mode 100644
index 7c8df8c..0000000
--- a/archivers.py
+++ /dev/null
@@ -1,412 +0,0 @@
-from dataclasses import dataclass
-import youtube_dl
-from bs4 import BeautifulSoup
-import requests
-import tiktok_downloader
-from loguru import logger
-import os
-import datetime
-import ffmpeg
-from botocore.errorfactory import ClientError
-import time
-import traceback
-
-# TODO There should be a better way of generating keys, that adds the following info:
-#           - name of sheet that it is being archived from
-#             (this means we might archive the same media twice on different sheets, but that's OK I think)
-#           - name of archiver/platform that the video comes from
-#       This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-#      cleaned up? Difficult is we don't know the filename until the archivers start working.
-
-
-def get_cdn_url(key):
-    return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-        os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-
-def do_s3_upload(s3_client, f, key):
-    s3_client.upload_fileobj(f, Bucket=os.getenv(
-        'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
-
-
-def get_key(filename):
-    key = filename.split('/')[1]
-    if 'unknown_video' in key:
-        key = key.replace('unknown_video', 'jpg')
-    return key
-
-
-def get_thumbnails(filename, s3_client, duration=None):
-    if not os.path.exists(filename.split('.')[0]):
-        os.mkdir(filename.split('.')[0])
-
-    fps = 0.5
-    if duration is not None:
-        duration = float(duration)
-
-        if duration < 60:
-            fps = 10.0 / duration
-        elif duration < 120:
-            fps = 20.0 / duration
-        else:
-            fps = 40.0 / duration
-
-    stream = ffmpeg.input(filename)
-    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
-    stream.output(filename.split('.')[0] + '/out%d.jpg').run()
-
-    thumbnails = os.listdir(filename.split('.')[0] + '/')
-    cdn_urls = []
-
-    for fname in thumbnails:
-        if fname[-3:] == 'jpg':
-            thumbnail_filename = filename.split('.')[0] + '/' + fname
-            key = filename.split('/')[1].split('.')[0] + '/' + fname
-
-            cdn_url = get_cdn_url(key)
-
-            with open(thumbnail_filename, 'rb') as f:
-                do_s3_upload(s3_client, f, key)
-
-            cdn_urls.append(cdn_url)
-            os.remove(thumbnail_filename)
-
-    if len(cdn_urls) == 0:
-        return ('None', 'None')
-
-    key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
-
-    index_page = f'''<html><head><title>{filename}</title></head>
-        <body>'''
-
-    for t in cdn_urls:
-        index_page += f'<img src="{t}" />'
-
-    index_page += f"</body></html>"
-    index_fname = filename.split('.')[0] + '/index.html'
-
-    with open(index_fname, 'w') as f:
-        f.write(index_page)
-
-    thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
-
-    s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-        'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
-
-    thumb_index_cdn_url = get_cdn_url(thumb_index)
-
-    return (key_thumb, thumb_index_cdn_url)
-
-
-@dataclass
-class ArchiveResult:
-    status: str
-    cdn_url: str = None
-    thumbnail: str = None
-    thumbnail_index: str = None
-    duration: float = None
-    title: str = None
-    timestamp: datetime.datetime = None
-
-
-class Archiver:
-    def __init__(self, s3_client):
-        self.s3 = s3_client
-
-    def download(self, url):
-        pass
-
-
-class TelegramArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        # detect URLs that we definitely cannot handle
-        if 'http://t.me/' not in url and 'https://t.me/' not in url:
-            return False
-
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
-        status = "success"
-
-        original_url = url
-
-        if url[-8:] != "?embed=1":
-            url += "?embed=1"
-
-        t = requests.get(url, headers=headers)
-        s = BeautifulSoup(t.content, 'html.parser')
-        video = s.find("video")
-
-        if video is None:
-            return False  # could not find video
-
-        video_url = video.get('src')
-        key = video_url.split('/')[-1].split('?')[0]
-        filename = 'tmp/' + key
-
-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
-
-        v = requests.get(video_url, headers=headers)
-
-        with open(filename, 'wb') as f:
-            f.write(v.content)
-
-        if status != 'already archived':
-            cdn_url = get_cdn_url(key)
-
-            with open(filename, 'rb') as f:
-                do_s3_upload(self.s3, f, key)
-
-        # extract duration from HTML
-        duration = s.find_all('time')[0].contents[0]
-        if ':' in duration:
-            duration = float(duration.split(
-                ':')[0])*60 + float(duration.split(':')[1])
-        else:
-            duration = float(duration)
-
-        # process thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
-        os.remove(filename)
-
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
-                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
-
-
-class YoutubeDLArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
-        if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
-            logger.info('Using Facebook cookie')
-            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
-
-        ydl = youtube_dl.YoutubeDL(ydl_opts)
-        cdn_url = None
-        status = 'success'
-
-        try:
-            info = ydl.extract_info(url, download=False)
-        except youtube_dl.utils.DownloadError:
-            # no video here
-            return False
-
-        if 'is_live' in info and info['is_live']:
-            logger.warning("Live streaming media, not archiving now")
-            return ArchiveResult(status="Streaming media")
-
-        if check_if_exists:
-            if 'entries' in info:
-                if len(info['entries']) > 1:
-                    logger.warning(
-                        'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
-                    return False
-                elif len(info['entries']) == 0:
-                    logger.warning(
-                        'YoutubeDLArchiver succeeded but did not find video')
-                    return False
-
-                filename = ydl.prepare_filename(info['entries'][0])
-            else:
-                filename = ydl.prepare_filename(info)
-
-            key = get_key(filename)
-
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
-
-        # sometimes this results in a different filename, so do this again
-        info = ydl.extract_info(url, download=True)
-
-        if 'entries' in info:
-            if len(info['entries']) > 1:
-                logger.warning(
-                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
-                return False
-            else:
-                info = info['entries'][0]
-
-        filename = ydl.prepare_filename(info)
-
-        if not os.path.exists(filename):
-            filename = filename.split('.')[0] + '.mkv'
-
-        if status != 'already archived':
-            key = get_key(filename)
-            cdn_url = get_cdn_url(key)
-
-            with open(filename, 'rb') as f:
-                do_s3_upload(self.s3, f, key)
-
-        # get duration
-        duration = info['duration'] if 'duration' in info else None
-
-        # get thumbnails
-        try:
-            key_thumb, thumb_index = get_thumbnails(
-                filename, self.s3, duration=duration)
-        except:
-            key_thumb = ''
-            thumb_index = 'Could not generate thumbnails'
-
-        os.remove(filename)
-
-        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
-            info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
-
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=timestamp)
-
-
-class WaybackArchiver(Archiver):
-    def __init__(self, s3_client):
-        self.s3 = s3_client
-        self.seen_urls = {}
-
-    def download(self, url, check_if_exists=False):
-        if check_if_exists and url in self.seen_urls:
-            return self.seen_urls[url]
-
-        ia_headers = {
-            "Accept": "application/json",
-            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
-        }
-
-        r = requests.post(
-            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
-
-        if r.status_code != 200:
-            return ArchiveResult(status="Internet archive failed")
-
-        if 'job_id' not in r.json() and 'message' in r.json():
-            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
-
-        job_id = r.json()['job_id']
-
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-
-        retries = 0
-
-        # wait 90-120 seconds for the archive job to finish
-        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
-            time.sleep(3)
-
-            try:
-                status_r = requests.get(
-                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-            except:
-                time.sleep(1)
-
-            retries += 1
-
-        if status_r.status_code != 200:
-            return ArchiveResult(status="Internet archive failed")
-
-        status_json = status_r.json()
-
-        if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
-
-        archive_url = 'https://web.archive.org/web/' + \
-            status_json['timestamp'] + '/' + status_json['original_url']
-
-        try:
-            r = requests.get(archive_url)
-
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
-
-            title = parsed.find_all('title')[
-                0].text
-
-            if title == 'Wayback Machine':
-                title = 'Could not get title'
-        except:
-            title = "Could not get title"
-
-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
-        self.seen_urls[url] = result
-        return result
-
-
-class TiktokArchiver(Archiver):
-    def download(self, url, check_if_exists=False):
-        if 'tiktok.com' not in url:
-            return False
-
-        status = 'success'
-
-        try:
-            info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
-            cdn_url = get_cdn_url(key)
-            filename = 'tmp/' + key
-
-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                    # file exists
-                    cdn_url = get_cdn_url(key)
-
-                    status = 'already archived'
-
-                except ClientError:
-                    pass
-
-            media = tiktok_downloader.snaptik(url).get_media()
-
-            if len(media) <= 0:
-                if status == 'already archived':
-                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
-                else:
-                    return ArchiveResult(status='Could not download media')
-
-            media[0].download(filename)
-
-            if status != 'already archived':
-                with open(filename, 'rb') as f:
-                    do_s3_upload(self.s3, f, key)
-
-            try:
-                key_thumb, thumb_index = get_thumbnails(
-                    filename, self.s3, duration=info.duration)
-            except:
-                key_thumb = ''
-                thumb_index = 'error creating thumbnails'
-
-            os.remove(filename)
-
-            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
-                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
-
-        except tiktok_downloader.Except.InvalidUrl:
-            status = 'Invalid URL'
-            return ArchiveResult(status=status)
-
-        except:
-            error = traceback.format_exc()
-            status = 'Other Tiktok error: ' + str(error)
-            return ArchiveResult(status=status)

From 1d62009c4f6043c53df39a88ae56df69752a012a Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 16:24:59 +0100
Subject: [PATCH 12/16] creates utils module and moves gworksheet there

---
 README.md                            | 4 ++--
 auto_archive.py                      | 2 +-
 utils/__init__.py                    | 2 ++
 gworksheet.py => utils/gworksheet.py | 0
 4 files changed, 5 insertions(+), 3 deletions(-)
 create mode 100644 utils/__init__.py
 rename gworksheet.py => utils/gworksheet.py (100%)

diff --git a/README.md b/README.md
index 3d7f751..7910e30 100644
--- a/README.md
+++ b/README.md
@@ -72,8 +72,8 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
 Code is split into functional concepts:
 1. [Archivers](archivers/) - receive a URL that they try to archive
 2. [Storages](storages/) - they deal with where the archived files go
-3. utilities
-   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
+3. [Utilities](utils/)
+   1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
 
 ### Current Archivers
 ```mermaid
diff --git a/auto_archive.py b/auto_archive.py
index cb70c58..ba05310 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
 
 import archivers
 from storages import S3Storage, S3Config
-from gworksheet import GWorksheet
+from utils import GWorksheet
 
 load_dotenv()
 
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..482e144
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,2 @@
+# we need to explicitly expose the available imports here
+from .gworksheet import GWorksheet
\ No newline at end of file
diff --git a/gworksheet.py b/utils/gworksheet.py
similarity index 100%
rename from gworksheet.py
rename to utils/gworksheet.py

From 3cafc444fc964ddc2ffcb34b94ac6ed67ae04a94 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 16:32:38 +0100
Subject: [PATCH 13/16] creates tmp folder if not exists

---
 __init__.py                | 1 -
 archivers/base_archiver.py | 4 ++--
 auto_archive.py            | 3 ++-
 utils/__init__.py          | 3 ++-
 utils/misc.py              | 5 +++++
 5 files changed, 11 insertions(+), 5 deletions(-)
 delete mode 100644 __init__.py
 create mode 100644 utils/misc.py

diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index b85e02a..0000000
--- a/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from storages import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 6257aba..dc47273 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
 from urllib.parse import urlparse
 
 from storages import Storage
+from utils import mkdir_if_not_exists
 
 
 @dataclass
@@ -48,8 +49,7 @@ class Archiver(ABC):
         thumbnails_folder = filename.split('.')[0] + '/'
         key_folder = key.split('.')[0] + '/'
 
-        if not os.path.exists(thumbnails_folder):
-            os.mkdir(thumbnails_folder)
+        mkdir_if_not_exists(thumbnails_folder)
 
         fps = 0.5
         if duration is not None:
diff --git a/auto_archive.py b/auto_archive.py
index ba05310..472efd2 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
 
 import archivers
 from storages import S3Storage, S3Config
-from utils import GWorksheet
+from utils import GWorksheet, mkdir_if_not_exists
 
 load_dotenv()
 
@@ -124,6 +124,7 @@ def main():
 
     logger.info(f'Opening document {args.sheet}')
 
+    mkdir_if_not_exists('tmp')
     process_sheet(args.sheet)
 
 
diff --git a/utils/__init__.py b/utils/__init__.py
index 482e144..9b58126 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -1,2 +1,3 @@
 # we need to explicitly expose the available imports here
-from .gworksheet import GWorksheet
\ No newline at end of file
+from .gworksheet import GWorksheet
+from .misc import *
\ No newline at end of file
diff --git a/utils/misc.py b/utils/misc.py
new file mode 100644
index 0000000..e8ef66d
--- /dev/null
+++ b/utils/misc.py
@@ -0,0 +1,5 @@
+import os
+
+def mkdir_if_not_exists(folder):
+    if not os.path.exists(folder):
+        os.mkdir(folder)
\ No newline at end of file

From 214d52d36f21e1d72851f3f9dbba6b77f1b3f79f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 16:43:42 +0100
Subject: [PATCH 14/16] improved tmp folder management

---
 archivers/base_archiver.py | 3 ++-
 auto_archive.py            | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index dc47273..12cca80 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,6 +1,7 @@
 import os
 import ffmpeg
 import datetime
+import shutil
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -78,7 +79,6 @@ class Archiver(ABC):
                 self.storage.upload(thumbnail_filename, key)
 
                 cdn_urls.append(cdn_url)
-                os.remove(thumbnail_filename)
 
         if len(cdn_urls) == 0:
             return ('None', 'None')
@@ -100,6 +100,7 @@ class Archiver(ABC):
         thumb_index = key_folder + 'index.html'
 
         self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
+        shutil.rmtree(thumbnails_folder)
 
         thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
 
diff --git a/auto_archive.py b/auto_archive.py
index 472efd2..ce82ee1 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,6 +2,7 @@ import os
 import datetime
 import argparse
 import requests
+import shutil
 import gspread
 from loguru import logger
 from dotenv import load_dotenv
@@ -126,7 +127,7 @@ def main():
 
     mkdir_if_not_exists('tmp')
     process_sheet(args.sheet)
-
+    shutil.rmtree('tmp')
 
 if __name__ == '__main__':
     main()

From 4bbbdcc7fd4902d28efeba063c587d8b32fb6e99 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 18:30:06 +0100
Subject: [PATCH 15/16] minor update

---
 auto_archive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_archive.py b/auto_archive.py
index ce82ee1..211d3d7 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -71,7 +71,7 @@ def process_sheet(sheet):
             continue
 
         if not gw.col_exists('status'):
-            logger.warning("No 'Archive status' column found, skipping")
+            logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
             continue
 
         # archives will be in a folder 'doc_name/worksheet_name'

From 8bce84082a94f8b5793203ec9a4c1a03c821a881 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 23 Feb 2022 18:32:40 +0100
Subject: [PATCH 16/16] minor updates

---
 archivers/telegram_archiver.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 5a9b013..d7b8924 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -36,10 +36,10 @@ class TelegramArchiver(Archiver):
         key = self.get_key(video_id)
 
         filename = 'tmp/' + key
+        cdn_url = self.storage.get_cdn_url(key)
 
         if check_if_exists and self.storage.exists(key):
             status = 'already archived'
-            cdn_url = self.storage.get_cdn_url(key)
 
         v = requests.get(video_url, headers=headers)
 
@@ -47,8 +47,6 @@ class TelegramArchiver(Archiver):
             f.write(v.content)
 
         if status != 'already archived':
-            cdn_url = self.storage.get_cdn_url(key)
-
             self.storage.upload(filename, key)
 
         # extract duration from HTML