diff --git a/.gitignore b/.gitignore
index 4f3d132..141d9f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
tmp/
-.env
+.env*
.DS_Store
expmt/
service_account.json
__pycache__/
+._*
+anu.html
diff --git a/Pipfile b/Pipfile
index 8c71f78..27071fa 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,10 +9,13 @@ boto3 = "*"
python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
-ffmpeg-python = "*"
beautifulsoup4 = "*"
+tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
+bs4 = "*"
+loguru = "*"
+ffmpeg-python = "*"
[dev-packages]
[requires]
-python_version = "3.8"
+python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
index ef838d8..9879884 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,11 +1,11 @@
{
"_meta": {
"hash": {
- "sha256": "2aa6e5f9d7cda1a459444bf812fb2f7a4acfe547e7c65a975ab41530f9213da5"
+ "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
},
"pipfile-spec": 6,
"requires": {
- "python_version": "3.8"
+ "python_version": "3.9"
},
"sources": [
{
@@ -26,49 +26,80 @@
},
"beautifulsoup4": {
"hashes": [
- "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
- "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
- "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
+ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
+ "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"
],
"index": "pypi",
- "version": "==4.9.3"
+ "version": "==4.10.0"
},
"boto3": {
"hashes": [
- "sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11",
- "sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e"
+ "sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989",
+ "sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557"
],
"index": "pypi",
- "version": "==1.18.27"
+ "version": "==1.21.3"
},
"botocore": {
"hashes": [
- "sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5",
- "sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5"
+ "sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7",
+ "sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046"
],
- "version": "==1.21.27"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.24.3"
+ },
+ "bs4": {
+ "hashes": [
+ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+ ],
+ "index": "pypi",
+ "version": "==0.0.1"
},
"cachetools": {
"hashes": [
- "sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001",
- "sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff"
+ "sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6",
+ "sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4"
],
- "version": "==4.2.2"
+ "markers": "python_version ~= '3.7'",
+ "version": "==5.0.0"
},
"certifi": {
"hashes": [
- "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
- "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
+ "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
+ "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
- "version": "==2021.5.30"
+ "version": "==2021.10.8"
},
"charset-normalizer": {
"hashes": [
- "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
- "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
+ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
+ "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
- "version": "==2.0.4"
+ "version": "==2.0.12"
+ },
+ "click": {
+ "hashes": [
+ "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
+ "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==8.0.4"
+ },
+ "cloudscraper": {
+ "hashes": [
+ "sha256:674fd739f9412188aae8d6614e3e6316939fc0670ef5646abd3d316f1a59d3c2",
+ "sha256:dda29028c5628b5ba3e4dc43816ed38fd46bd945ef938c420f185586a6d8dff2"
+ ],
+ "version": "==1.2.58"
+ },
+ "faker": {
+ "hashes": [
+ "sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
+ "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==13.0.0"
},
"ffmpeg-python": {
"hashes": [
@@ -78,55 +109,138 @@
"index": "pypi",
"version": "==0.2.0"
},
+ "flask": {
+ "hashes": [
+ "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f",
+ "sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
+ },
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"google-auth": {
"hashes": [
- "sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c",
- "sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3"
+ "sha256:218ca03d7744ca0c8b6697b6083334be7df49b7bf76a69d555962fd1a7657b5f",
+ "sha256:ad160fc1ea8f19e331a16a14a79f3d643d813a69534ba9611d2c80dc10439dad"
],
- "version": "==2.0.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==2.6.0"
},
"google-auth-oauthlib": {
"hashes": [
- "sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba",
- "sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67"
+ "sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73",
+ "sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a"
],
- "version": "==0.4.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.4.6"
},
"gspread": {
"hashes": [
- "sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be",
- "sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48"
+ "sha256:d9db8c43d552f541ea072d4727d1e955bc2368b095dd86c5429a845c9d8aed8f",
+ "sha256:ffba57786e27519fb97125e3de37a0f062134a396506681f5baacaf47a9febe3"
],
"index": "pypi",
- "version": "==4.0.1"
+ "version": "==5.1.1"
},
"idna": {
"hashes": [
- "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
- "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
+ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
+ "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
- "version": "==3.2"
+ "version": "==3.3"
+ },
+ "itsdangerous": {
+ "hashes": [
+ "sha256:29285842166554469a56d427addc0843914172343784cb909695fdbe90a3e129",
+ "sha256:d848fcb8bc7d507c4546b448574e8a44fc4ea2ba84ebf8d783290d53e81992f5"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.0"
+ },
+ "jinja2": {
+ "hashes": [
+ "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
+ "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.3"
},
"jmespath": {
"hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
"sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.0"
},
+ "loguru": {
+ "hashes": [
+ "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
+ "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
+ ],
+ "index": "pypi",
+ "version": "==0.6.0"
+ },
+ "markupsafe": {
+ "hashes": [
+ "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
+ "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
+ "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
+ "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
+ "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
+ "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
+ "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
+ "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
+ "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
+ "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
+ "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
+ "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
+ "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
+ "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
+ "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
+ "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
+ "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
+ "sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
+ "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
+ "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
+ "sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
+ "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
+ "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
+ "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
+ "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
+ "sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
+ "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
+ "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
+ "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
+ "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
+ "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
+ "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
+ "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
+ "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
+ "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
+ "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
+ "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
+ "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
+ "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
+ "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.0"
+ },
"oauthlib": {
"hashes": [
- "sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc",
- "sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3"
+ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
+ "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
- "version": "==3.1.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.2.0"
},
"pyasn1": {
"hashes": [
@@ -164,79 +278,112 @@
],
"version": "==0.2.8"
},
+ "pyparsing": {
+ "hashes": [
+ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
+ "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.7"
+ },
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"python-dotenv": {
"hashes": [
- "sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1",
- "sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172"
+ "sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3",
+ "sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f"
],
"index": "pypi",
- "version": "==0.19.0"
+ "version": "==0.19.2"
},
"requests": {
"hashes": [
- "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
- "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
+ "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
- "version": "==2.26.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==2.27.1"
},
"requests-oauthlib": {
"hashes": [
- "sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
- "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
- "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
+ "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
+ "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
],
- "version": "==1.3.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==1.3.1"
+ },
+ "requests-toolbelt": {
+ "hashes": [
+ "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f",
+ "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"
+ ],
+ "version": "==0.9.1"
},
"rsa": {
"hashes": [
- "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2",
- "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"
+ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
+ "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
- "version": "==4.7.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.8"
},
"s3transfer": {
"hashes": [
- "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c",
- "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"
+ "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
+ "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
],
- "version": "==0.5.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.5.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"soupsieve": {
"hashes": [
- "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
- "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
+ "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
+ "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"
],
- "markers": "python_version >= '3.0'",
- "version": "==2.2.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.1"
+ },
+ "tiktok-downloader": {
+ "git": "https://github.com/msramalho/tiktok-downloader",
+ "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2"
},
"urllib3": {
"hashes": [
- "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
- "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
+ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
+ "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
- "version": "==1.26.6"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+ "version": "==1.26.8"
+ },
+ "werkzeug": {
+ "hashes": [
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
},
"youtube-dl": {
"hashes": [
- "sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac",
- "sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab"
+ "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
+ "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
],
"index": "pypi",
- "version": "==2021.6.6"
+ "version": "==2021.12.17"
}
},
"develop": {}
diff --git a/README.md b/README.md
index 2e40bcc..7910e30 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
+[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
+
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
@@ -66,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil

+# Code structure
+Code is split into functional concepts:
+1. [Archivers](archivers/) - receive a URL that they try to archive
+2. [Storages](storages/) - they deal with where the archived files go
+3. [Utilities](utils/)
+ 1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
+
+### Current Archivers
+```mermaid
+graph TD
+ A(Archiver) -->|parent of| B(TelegramArchiver)
+ A -->|parent of| C(TiktokArchiver)
+ A -->|parent of| D(YoutubeDLArchiver)
+ A -->|parent of| E(WaybackArchiver)
+```
+### Current Storages
+```mermaid
+graph TD
+ A(BaseStorage) -->|parent of| B(S3Storage)
+```
diff --git a/archivers/__init__.py b/archivers/__init__.py
new file mode 100644
index 0000000..e6c4ba6
--- /dev/null
+++ b/archivers/__init__.py
@@ -0,0 +1,6 @@
+# we need to explicitly expose the available imports here
+from .base_archiver import *
+from .telegram_archiver import *
+from .tiktok_archiver import *
+from .wayback_archiver import *
+from .youtubedl_archiver import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
new file mode 100644
index 0000000..12cca80
--- /dev/null
+++ b/archivers/base_archiver.py
@@ -0,0 +1,107 @@
+import os
+import ffmpeg
+import datetime
+import shutil
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+from urllib.parse import urlparse
+
+from storages import Storage
+from utils import mkdir_if_not_exists
+
+
+# Outcome of a single archiving attempt, as returned by the archivers'
+# download() methods. Only `status` is required; every other field
+# defaults to None.
+@dataclass
+class ArchiveResult:
+ # human-readable outcome, e.g. 'success' or 'already archived'
+ status: str
+ # URL of the archived copy (storage CDN URL, or a web.archive.org URL)
+ cdn_url: str = None
+ # URL of one representative thumbnail frame
+ thumbnail: str = None
+ # URL of the thumbnails index page, or an error text when generation failed
+ thumbnail_index: str = None
+ # media duration; presumably seconds -- TODO confirm against callers
+ duration: float = None
+ title: str = None
+ # NOTE(review): annotated as datetime, but the archivers in this change
+ # pass ISO-format strings or numeric epoch timestamps -- confirm the
+ # intended type
+ timestamp: datetime.datetime = None
+
+
+class Archiver(ABC):
+    """
+    Base class for all archivers.
+
+    Subclasses set `name` (used as the prefix of every storage key) and
+    implement `download`. The shared helpers build storage keys and generate
+    and upload video thumbnails through the injected `storage`.
+    """
+    name = "default"
+
+    def __init__(self, storage: Storage):
+        self.storage = storage
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass
+
+    def get_netloc(self, url):
+        """returns the network location (host) part of the URL"""
+        return urlparse(url).netloc
+
+    def get_key(self, filename):
+        """
+        returns a key in the format "[archiverName]_[filename]" includes extension
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'
+
+    def get_thumbnails(self, filename, key, duration=None):
+        """
+        Extracts thumbnails from the local video `filename` with ffmpeg,
+        uploads them plus an HTML index page to storage under the folder
+        derived from `key`, and removes the local thumbnails folder.
+
+        returns a tuple (thumbnail_url, index_page_url), or ('None', 'None')
+        when ffmpeg produced no thumbnails
+        """
+        thumbnails_folder = filename.split('.')[0] + '/'
+        key_folder = key.split('.')[0] + '/'
+
+        mkdir_if_not_exists(thumbnails_folder)
+
+        # aim for roughly 10/20/40 frames depending on the video length;
+        # without a usable (positive) duration fall back to one frame every 2s
+        fps = 0.5
+        if duration is not None:
+            duration = float(duration)
+
+            if 0 < duration < 60:
+                fps = 10.0 / duration
+            elif 60 <= duration < 120:
+                fps = 20.0 / duration
+            elif duration >= 120:
+                fps = 40.0 / duration
+
+        stream = ffmpeg.input(filename)
+        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
+        stream.output(thumbnails_folder + 'out%d.jpg').run()
+
+        cdn_urls = []
+        for fname in os.listdir(thumbnails_folder):
+            if not fname.endswith('jpg'):
+                continue
+            thumbnail_filename = thumbnails_folder + fname
+            # separate name so the `key` parameter is not overwritten
+            thumb_key = key_folder + fname
+
+            cdn_url = self.storage.get_cdn_url(thumb_key)
+            self.storage.upload(thumbnail_filename, thumb_key)
+            cdn_urls.append(cdn_url)
+
+        if len(cdn_urls) == 0:
+            # do not leave the empty temporary folder behind
+            shutil.rmtree(thumbnails_folder)
+            return ('None', 'None')
+
+        # pick a frame ~10% into the list as the representative thumbnail
+        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
+
+        images = ''.join(f'<img src="{t}" />' for t in cdn_urls)
+        index_page = f'<html><head><title>{filename}</title></head><body>{images}</body></html>'
+        index_fname = thumbnails_folder + 'index.html'
+
+        with open(index_fname, 'w') as f:
+            f.write(index_page)
+
+        thumb_index = key_folder + 'index.html'
+
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
+        shutil.rmtree(thumbnails_folder)
+
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
+
+        return (key_thumb, thumb_index_cdn_url)
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
new file mode 100644
index 0000000..d7b8924
--- /dev/null
+++ b/archivers/telegram_archiver.py
@@ -0,0 +1,65 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+
+from .base_archiver import Archiver, ArchiveResult
+
+
+class TelegramArchiver(Archiver):
+    """Archives the video embedded in a public t.me post."""
+    name = "telegram"
+
+    @staticmethod
+    def _parse_duration(text):
+        """converts 'SS', 'MM:SS' or 'HH:MM:SS' into a number of seconds"""
+        seconds = 0.0
+        for part in text.split(':'):
+            seconds = seconds * 60 + float(part)
+        return seconds
+
+    def download(self, url, check_if_exists=False):
+        """
+        Downloads the video of a t.me post, uploads it to storage (unless
+        `check_if_exists` is set and it is already there) and generates
+        thumbnails.
+
+        returns False when the URL is not a t.me URL or no video was found,
+        otherwise an ArchiveResult
+        """
+        # detect URLs that we definitely cannot handle
+        if 't.me' != self.get_netloc(url):
+            return False
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+        }
+        status = "success"
+
+        original_url = url
+
+        # TODO: check if we can do this more resilient to variable URLs
+        if not url.endswith("?embed=1"):
+            url += "?embed=1"
+
+        t = requests.get(url, headers=headers)
+        s = BeautifulSoup(t.content, 'html.parser')
+        video = s.find("video")
+
+        if video is None:
+            return False  # could not find video
+
+        video_url = video.get('src')
+        if not video_url:
+            return False  # <video> tag without a source to download
+
+        video_id = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(video_id)
+
+        filename = 'tmp/' + key
+        cdn_url = self.storage.get_cdn_url(key)
+
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+
+        # the file is downloaded even when already archived: it is needed
+        # locally to generate the thumbnails
+        v = requests.get(video_url, headers=headers)
+
+        with open(filename, 'wb') as f:
+            f.write(v.content)
+
+        if status != 'already archived':
+            self.storage.upload(filename, key)
+
+        # extract duration and timestamp from HTML: the first <time> tag
+        # holds the duration text, the second one the post datetime
+        time_tags = s.find_all('time')
+        duration = self._parse_duration(time_tags[0].contents[0]) if time_tags else None
+        timestamp = time_tags[1].get('datetime') if len(time_tags) > 1 else None
+
+        # process thumbnails
+        key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+                             duration=duration, title=original_url, timestamp=timestamp)
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
new file mode 100644
index 0000000..62aa415
--- /dev/null
+++ b/archivers/tiktok_archiver.py
@@ -0,0 +1,60 @@
+import os, traceback
+import tiktok_downloader
+from loguru import logger
+
+from .base_archiver import Archiver, ArchiveResult
+
+
+class TiktokArchiver(Archiver):
+    """Archives TikTok videos using the tiktok_downloader package."""
+    name = "tiktok"
+
+    def download(self, url, check_if_exists=False):
+        """
+        Downloads a TikTok video, uploads it to storage (unless
+        `check_if_exists` is set and it is already there) and generates
+        thumbnails.
+
+        returns False when the URL does not contain 'tiktok.com', otherwise
+        an ArchiveResult whose status describes success or the failure
+        """
+        if 'tiktok.com' not in url:
+            return False
+
+        status = 'success'
+
+        try:
+            info = tiktok_downloader.info_post(url)
+            key = self.get_key(f'{info.id}.mp4')
+            cdn_url = self.storage.get_cdn_url(key)
+            filename = 'tmp/' + key
+
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
+
+            media = tiktok_downloader.snaptik(url).get_media()
+
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)
+
+            if status != 'already archived':
+                self.storage.upload(filename, key)
+
+            # thumbnails are best-effort: a failure must not lose the archive
+            try:
+                key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
+            except Exception as e:
+                logger.error(e)
+                key_thumb = ''
+                thumb_index = 'error creating thumbnails'
+
+            try:
+                os.remove(filename)
+            except FileNotFoundError:
+                logger.info(f'tmp file not found thus not deleted {filename}')
+
+            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
+
+        except tiktok_downloader.Except.InvalidUrl:
+            status = 'Invalid URL'
+            return ArchiveResult(status=status)
+
+        except Exception:
+            # report any other failure in the result instead of crashing;
+            # `except Exception` lets KeyboardInterrupt/SystemExit propagate
+            error = traceback.format_exc()
+            status = 'Other Tiktok error: ' + str(error)
+            return ArchiveResult(status=status)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
new file mode 100644
index 0000000..53b356f
--- /dev/null
+++ b/archivers/wayback_archiver.py
@@ -0,0 +1,76 @@
+import time, requests, os
+from bs4 import BeautifulSoup
+
+from storages import Storage
+from .base_archiver import Archiver, ArchiveResult
+
+
+class WaybackArchiver(Archiver):
+    """Archives a page through the Internet Archive "save" endpoint."""
+    name = "wayback"
+
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
+        # results already obtained by this instance, keyed by URL
+        self.seen_urls = {}
+
+    def download(self, url, check_if_exists=False):
+        """
+        Submits `url` to web.archive.org, polls the save job until it stops
+        being pending (or retries run out) and reads the archived page title.
+
+        With `check_if_exists` set, a URL already archived by this instance
+        returns the stored result. Always returns an ArchiveResult.
+        """
+        if check_if_exists and url in self.seen_urls:
+            return self.seen_urls[url]
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
+        }
+
+        r = requests.post(
+            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        response = r.json()
+
+        if 'job_id' not in response:
+            # without a job id there is nothing to poll; report the reason
+            # when the service provided one
+            if 'message' in response:
+                return ArchiveResult(status=f"Internet archive failed: {response['message']}")
+            return ArchiveResult(status="Internet archive failed")
+
+        job_id = response['job_id']
+
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+
+        retries = 0
+
+        # wait 90-120 seconds for the archive job to finish
+        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
+            time.sleep(3)
+
+            try:
+                status_r = requests.get(
+                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+            except requests.exceptions.RequestException:
+                # transient network problem: back off a little and retry
+                time.sleep(1)
+
+            retries += 1
+
+        if status_r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        status_json = status_r.json()
+
+        if status_json['status'] != 'success':
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
+
+        archive_url = 'https://web.archive.org/web/' + \
+            status_json['timestamp'] + '/' + status_json['original_url']
+
+        # the title is best-effort: any failure falls back to a placeholder
+        try:
+            r = requests.get(archive_url)
+
+            parsed = BeautifulSoup(r.content, 'html.parser')
+
+            title = parsed.find_all('title')[0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
+        except Exception:
+            title = "Could not get title"
+
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        self.seen_urls[url] = result
+        return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
new file mode 100644
index 0000000..ec11061
--- /dev/null
+++ b/archivers/youtubedl_archiver.py
@@ -0,0 +1,92 @@
+
+import os
+import datetime
+import youtube_dl
+from loguru import logger
+
+from .base_archiver import Archiver, ArchiveResult
+
+
+class YoutubeDLArchiver(Archiver):
+    """Archives any single video that youtube_dl is able to extract."""
+    # NOTE(review): "yotube_dl" looks like a typo, but it is the prefix of
+    # every key built by get_key; renaming it would change the keys looked
+    # up in storage, so it is left untouched here
+    name = "yotube_dl"
+    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
+
+    def download(self, url, check_if_exists=False):
+        """
+        Downloads the video at `url` with youtube_dl, uploads it to storage
+        (unless `check_if_exists` is set and it is already there) and
+        generates thumbnails.
+
+        returns False when there is no (single) video to archive, otherwise
+        an ArchiveResult
+        """
+        netloc = self.get_netloc(url)
+        if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'):
+            logger.info('Using Facebook cookie')
+            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
+
+        ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
+        cdn_url = None
+        status = 'success'
+
+        try:
+            info = ydl.extract_info(url, download=False)
+        except youtube_dl.utils.DownloadError:
+            # no video here
+            return False
+
+        if info.get('is_live', False):
+            logger.warning("Live streaming media, not archiving now")
+            return ArchiveResult(status="Streaming media")
+
+        if check_if_exists:
+            if 'entries' in info:
+                if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
+                    logger.warning(
+                        'YoutubeDLArchiver succeeded but did not find video')
+                    return False
+
+                filename = ydl.prepare_filename(info['entries'][0])
+            else:
+                filename = ydl.prepare_filename(info)
+
+            key = self.get_key(filename)
+
+            if self.storage.exists(key):
+                status = 'already archived'
+                cdn_url = self.storage.get_cdn_url(key)
+
+        # sometimes this results in a different filename, so do this again
+        info = ydl.extract_info(url, download=True)
+
+        # TODO: add support for multiple videos
+        if 'entries' in info:
+            if len(info['entries']) > 1:
+                logger.warning(
+                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                return False
+            elif len(info['entries']) == 0:
+                # nothing was downloaded, so there is no entry to index into
+                logger.warning(
+                    'YoutubeDLArchiver succeeded but did not find video')
+                return False
+            else:
+                info = info['entries'][0]
+
+        filename = ydl.prepare_filename(info)
+
+        if not os.path.exists(filename):
+            # the downloaded file may have been saved as .mkv instead of the
+            # extension prepare_filename predicted
+            filename = os.path.splitext(filename)[0] + '.mkv'
+
+        if status != 'already archived':
+            key = self.get_key(filename)
+            cdn_url = self.storage.get_cdn_url(key)
+
+            self.storage.upload(filename, key)
+
+        # get duration
+        duration = info.get('duration')
+
+        # get thumbnails (best-effort: a failure must not lose the archive)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
+        except Exception as e:
+            logger.error(e)
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
+        os.remove(filename)
+
+        # prefer the exact timestamp, fall back to the upload date
+        if 'timestamp' in info:
+            timestamp = info['timestamp']
+        elif info.get('upload_date') is not None:
+            timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
+        else:
+            timestamp = None
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+                             title=info.get('title'), timestamp=timestamp)
diff --git a/auto_archive.py b/auto_archive.py
index f0f6862..211d3d7 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,506 +1,133 @@
-import gspread
-import youtube_dl
-from pathlib import Path
-import sys
-import datetime
-import boto3
import os
-from dotenv import load_dotenv
-from botocore.errorfactory import ClientError
+import datetime
import argparse
-import math
-import ffmpeg
-import threading
-import time
-from bs4 import BeautifulSoup
import requests
+import shutil
+import gspread
+from loguru import logger
+from dotenv import load_dotenv
+
+import archivers
+from storages import S3Storage, S3Config
+from utils import GWorksheet, mkdir_if_not_exists
load_dotenv()
-def col_to_index(col):
- col = list(col)
- ndigits = len(col)
- alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- v = 0
- i = ndigits - 1
+def update_sheet(gw, row, result: archivers.ArchiveResult):
+    """Write the outcome of an archiving attempt back to its worksheet row.
+
+    The status cell is always overwritten. Every other column is only
+    filled in when the result carries a truthy value for it, the column
+    exists in the worksheet and the cell is currently empty, so data that
+    is already in the sheet is never overwritten.
+
+    Args:
+        gw: worksheet wrapper providing get_row, col_exists, get_cell and
+            batch_set_cell.
+        row: 1-based row number to update.
+        result: archive result whose fields are copied into the row; the
+            object itself is left unmodified.
+    """
+    cell_updates = []
+    # read the row once; the emptiness checks below reuse these values
+    row_values = gw.get_row(row)
- for digit in col:
- index = alphabet.find(digit)
- v += (26 ** i) * index
- i -= 1
+
+    def batch_if_valid(col, val, final_value=None):
+        # `val` decides whether anything is written, `final_value` (when
+        # given) is what actually goes into the cell
+        final_value = final_value or val
+        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+            cell_updates.append((row, col, final_value))
- return v - 1
+
+    cell_updates.append((row, 'status', result.status))
+    batch_if_valid('archive', result.cdn_url)
+    batch_if_valid('date', True, datetime.datetime.now().isoformat())
+    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+    batch_if_valid('thumbnail_index', result.thumbnail_index)
+    batch_if_valid('title', result.title)
+    batch_if_valid('duration', result.duration, str(result.duration))
-def index_to_col(index):
- alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+    # numeric (epoch) timestamps are rendered as ISO strings; work on a
+    # local copy so the caller's result object is not altered
+    timestamp = result.timestamp
+    if timestamp and not isinstance(timestamp, str):
+        timestamp = datetime.datetime.fromtimestamp(timestamp).isoformat()
+    batch_if_valid('timestamp', timestamp)
- if index > 25:
- t = index
- dig = 0
- while t > 25:
- t = math.floor(t / 26)
- dig += 1
- return alphabet[t - 1] + index_to_col(index - t * int(math.pow(26, dig)))
- else:
- return alphabet[index]
+
+    gw.batch_set_cell(cell_updates)
-def get_cdn_url(key):
- return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
- os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-def do_s3_upload(s3_client, f, key):
- s3_client.upload_fileobj(f, Bucket=os.getenv(
- 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
-
-
-def get_thumbnails(filename, s3_client, duration = None):
- if not os.path.exists(filename.split('.')[0]):
- os.mkdir(filename.split('.')[0])
-
- fps = 0.5
- if duration is not None:
- duration = float(duration)
-
- if duration < 60:
- fps = 10.0 / duration
- elif duration < 120:
- fps = 20.0 / duration
- else:
- fps = 40.0 / duration
-
-
- stream = ffmpeg.input(filename)
- stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
- stream.output(filename.split('.')[0] + '/out%d.jpg').run()
-
- thumbnails = os.listdir(filename.split('.')[0] + '/')
- cdn_urls = []
-
- for fname in thumbnails:
- if fname[-3:] == 'jpg':
- thumbnail_filename = filename.split('.')[0] + '/' + fname
- key = filename.split('/')[1].split('.')[0] + '/' + fname
-
- cdn_url = get_cdn_url(key)
-
- with open(thumbnail_filename, 'rb') as f:
- do_s3_upload(s3_client, f, key)
-
- cdn_urls.append(cdn_url)
- os.remove(thumbnail_filename)
-
- if len(cdn_urls) == 0:
- return ('None', 'None')
-
- key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
-
- index_page = f'''{filename}
- '''
-
- for t in cdn_urls:
- index_page += f'
'
-
- index_page += f""
- index_fname = filename.split('.')[0] + '/index.html'
-
- with open(index_fname, 'w') as f:
- f.write(index_page)
-
- thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
-
- s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
- 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
-
- thumb_index_cdn_url = get_cdn_url(thumb_index)
-
- return (key_thumb, thumb_index_cdn_url)
-
-
-def download_telegram_video(url, s3_client, check_if_exists=False):
- status = 'success'
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
-
- original_url = url
-
- if url[-8:] != "?embed=1":
- url += "?embed=1"
-
- t = requests.get(url, headers=headers)
- s = BeautifulSoup(t.content, 'html.parser')
- video = s.find("video")
-
- if video is None:
- return ({}, 'No telegram video found')
- else:
- video_url = video.get('src')
- key = video_url.split('/')[-1].split('?')[0]
- filename = 'tmp/' + key
-
- if check_if_exists:
- try:
- s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
- # file exists
- cdn_url = get_cdn_url(key)
-
- status = 'already archived'
-
- except ClientError:
- pass
-
- v = requests.get(video_url, headers=headers)
-
- with open(filename, 'wb') as f:
- f.write(v.content)
-
- if status != 'already archived':
- cdn_url = get_cdn_url(key)
-
- with open(filename, 'rb') as f:
- do_s3_upload(s3_client, f, key)
-
- duration = s.find_all('time')[0].contents[0]
- if ':' in duration:
- duration = float(duration.split(':')[0])*60 + float(duration.split(':')[1])
- else:
- duration = float(duration)
-
- key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
- os.remove(filename)
-
- video_data = {
- 'cdn_url': cdn_url,
- 'thumbnail': key_thumb,
- 'thumbnail_index': thumb_index,
- 'duration': duration,
- 'title': original_url,
- 'timestamp': s.find_all('time')[1].get('datetime')
- }
-
- return (video_data, status)
-
-
-def internet_archive(url, s3_client):
-
-
- ia_headers = {
- "Accept": "application/json",
- "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
- }
-
- r = requests.post(
- 'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
-
- if r.status_code != 200:
- return ({}, 'Internet archive failed')
- else:
- job_id = r.json()['job_id']
-
- status_r = requests.get(
- 'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-
- retries = 0
-
- while status_r.json()['status'] == 'pending' and retries < 40:
- time.sleep(5)
-
- try:
- status_r = requests.get(
- 'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
- except:
- time.sleep(1)
-
- retries += 1
-
- status_json = status_r.json()
-
- if status_json['status'] == 'success':
- url = 'https://web.archive.org/web/' + \
- status_json['timestamp'] + '/' + status_json['original_url']
-
- r = requests.get(url)
-
- parsed = BeautifulSoup(
- r.content, 'html.parser')
- title = parsed.find_all('title')[
- 0].text
-
- return ({'cdn_url': url, 'title': title}, 'Internet Archive fallback')
- else:
- return ({}, 'Internet Archive failed: ' + status_json['message'])
-
-def get_key(filename):
- key = filename.split('/')[1]
- if 'unknown_video' in key:
- key = key.replace('unknown_video', 'jpg')
- return key
-
-
-def download_vid(url, s3_client, check_if_exists=False):
- ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
- if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
- print('Using cookie')
- youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
- ydl = youtube_dl.YoutubeDL(ydl_opts)
- cdn_url = None
- status = 'success'
-
- if check_if_exists:
- info = ydl.extract_info(url, download=False)
-
- if 'entries' in info:
- if len(info['entries']) > 1:
- raise Exception(
- 'ERROR: Cannot archive channels or pages with multiple videos')
-
- filename = ydl.prepare_filename(info['entries'][0])
- else:
- filename = ydl.prepare_filename(info)
-
- key = get_key(filename)
+def expand_url(url):
+    """Resolve a shortened t.co link to the URL it redirects to.
+
+    Any URL that does not contain 'https://t.co/' is returned unchanged.
+    Expansion is best effort: if the request fails the error is logged
+    and the original URL is returned.
+
+    Args:
+        url: the URL read from the sheet.
+
+    Returns:
+        The final URL after following redirects, or `url` itself.
+    """
+    # expand short URL links
+    if 'https://t.co/' in url:
try:
- s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
- # file exists
- cdn_url = get_cdn_url(os, key)
-
- status = 'already archived'
-
- except ClientError:
- pass
-
- # sometimes this results in a different filename, so do this again
- info = ydl.extract_info(url, download=True)
-
- if 'entries' in info:
- if len(info['entries']) > 1:
- raise Exception(
- 'ERROR: Cannot archive channels or pages with multiple videos')
- else:
- info = info['entries'][0]
-
- filename = ydl.prepare_filename(info)
-
- if not os.path.exists(filename):
- filename = filename.split('.')[0] + '.mkv'
-
- if status != 'already archived':
- key = get_key(filename)
- cdn_url = get_cdn_url(os, key)
-
- with open(filename, 'rb') as f:
- do_s3_upload(s3_client, f, key)
-
- duration = info['duration'] if 'duration' in info else None
- key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
- os.remove(filename)
-
- video_data = {
- 'cdn_url': cdn_url,
- 'thumbnail': key_thumb,
- 'thumbnail_index': thumb_index,
- 'duration': duration,
- 'title': info['title'] if 'title' in info else None,
- 'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None,
- }
-
- return (video_data, status)
-
-
-def update_sheet(wks, row, status, video_data, columns, v):
- update = []
-
- if columns['status'] is not None:
- update += [{
- 'range': columns['status'] + str(row),
- 'values': [[status]]
- }]
-
- if 'cdn_url' in video_data and video_data['cdn_url'] is not None and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '':
- update += [{
- 'range': columns['archive'] + str(row),
- 'values': [[video_data['cdn_url']]]
- }]
-
- if 'date' in video_data and columns['date'] is not None and v[col_to_index(columns['date'])] == '':
- update += [{
- 'range': columns['date'] + str(row),
- 'values': [[datetime.datetime.now().isoformat()]]
- }]
-
- if 'thumbnail' in video_data and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '':
- update += [{
- 'range': columns['thumbnail'] + str(row),
- 'values': [['=IMAGE("' + video_data['thumbnail'] + '")']]
- }]
-
- if 'thumbnail_index' in video_data and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '':
- update += [{
- 'range': columns['thumbnail_index'] + str(row),
- 'values': [[video_data['thumbnail_index']]]
- }]
-
- if 'timestamp' in video_data and columns['timestamp'] is not None and video_data['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '':
- update += [{
- 'range': columns['timestamp'] + str(row),
- 'values': [[video_data['timestamp']]] if type(video_data['timestamp']) == str else [[datetime.datetime.fromtimestamp(video_data['timestamp']).isoformat()]]
- }]
-
- if 'title' in video_data and columns['title'] is not None and video_data['title'] is not None and v[col_to_index(columns['title'])] == '':
- update += [{
- 'range': columns['title'] + str(row),
- 'values': [[video_data['title']]]
- }]
-
- if 'duration' in video_data and columns['duration'] is not None and video_data['duration'] is not None and v[col_to_index(columns['duration'])] == '':
- update += [{
- 'range': columns['duration'] + str(row),
- 'values': [[str(video_data['duration'])]]
- }]
-
- wks.batch_update(update, value_input_option='USER_ENTERED')
-
-
-def record_stream(url, s3_client, wks, i, columns, v):
- video_data, status = download_vid(url, s3_client)
- update_sheet(wks, i, status, video_data, columns, v)
+            # bounded wait so one unresponsive link cannot stall the whole run
+            r = requests.get(url, timeout=30)
+            url = r.url
+        except Exception as e:
+            # best effort: keep the short URL, but let KeyboardInterrupt and
+            # SystemExit propagate instead of being swallowed
+            logger.error(f'Failed to expand url {url}: {e}')
+    return url
def process_sheet(sheet):
+ """
+ Archive every pending row of every worksheet in the Google Sheets document `sheet`.
+
+ The document is opened by name with the credentials in 'service_account.json'.
+ Worksheets without a url or status column are skipped. A row is pending when its
+ url cell is non-empty and its status cell is empty; it is marked
+ 'Archive in progress', handed to each archiver in turn until one reports
+ 'success' or 'already archived', and the row is then updated with the outcome.
+ """
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
- n_worksheets = len(sh.worksheets())
- s3_client = boto3.client('s3',
- region_name=os.getenv('DO_SPACES_REGION'),
- endpoint_url='https://{}.digitaloceanspaces.com'.format(
- os.getenv('DO_SPACES_REGION')),
- aws_access_key_id=os.getenv('DO_SPACES_KEY'),
- aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+ # storage settings come from the environment; the folder is filled in per worksheet below
+ s3_config = S3Config(
+ bucket=os.getenv('DO_BUCKET'),
+ region=os.getenv('DO_SPACES_REGION'),
+ key=os.getenv('DO_SPACES_KEY'),
+ secret=os.getenv('DO_SPACES_SECRET')
+ )
# loop through worksheets to check
- for ii in range(n_worksheets):
- print("Opening worksheet " + str(ii))
- wks = sh.get_worksheet(ii)
- values = wks.get_all_values()
+ for ii, wks in enumerate(sh.worksheets()):
+ logger.info(f'Opening worksheet {ii}: "{wks.title}"')
+ gw = GWorksheet(wks)
- headers = [v.lower() for v in values[0]]
- columns = {}
-
- columns['url'] = index_to_col(headers.index(
- 'media url')) if 'media url' in headers else index_to_col(headers.index(
- 'source url')) if 'source url' in headers else None
-
- if columns['url'] is None:
- print("No 'Media URL' column found, skipping")
+ if not gw.col_exists('url'):
+ logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
- url_index = col_to_index(columns['url'])
-
- columns['archive'] = index_to_col(headers.index(
- 'archive location')) if 'archive location' in headers else None
- columns['date'] = index_to_col(headers.index(
- 'archive date')) if 'archive date' in headers else None
- columns['status'] = index_to_col(headers.index(
- 'archive status')) if 'archive status' in headers else None
-
- if columns['status'] is None:
- print("No 'Archive status' column found, skipping")
+ if not gw.col_exists('status'):
+ logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
continue
- columns['thumbnail'] = index_to_col(headers.index(
- 'thumbnail')) if 'thumbnail' in headers else None
- columns['thumbnail_index'] = index_to_col(headers.index(
- 'thumbnail index')) if 'thumbnail index' in headers else None
- columns['timestamp'] = index_to_col(headers.index(
- 'upload timestamp')) if 'upload timestamp' in headers else None
- columns['title'] = index_to_col(headers.index(
- 'upload title')) if 'upload title' in headers else None
- columns['duration'] = index_to_col(headers.index(
- 'duration')) if 'duration' in headers else None
+ # archives will be in a folder 'doc_name/worksheet_name'
+ s3_config.folder = f'{sheet}/{wks.title}/'
+ s3_client = S3Storage(s3_config)
+
+ # order matters, first to succeed excludes remaining
+ active_archivers = [
+ archivers.TelegramArchiver(s3_client),
+ archivers.TiktokArchiver(s3_client),
+ archivers.YoutubeDLArchiver(s3_client),
+ archivers.WaybackArchiver(s3_client)
+ ]
# loop through rows in worksheet
- for i in range(2, len(values)+1):
- v = values[i-1]
+ # row 1 holds the headers, so data rows start at 2 (rows are 1-based)
+ for row in range(2, gw.count_rows() + 1):
+ url = gw.get_cell(row, 'url')
+ status = gw.get_cell(row, 'status')
+ if url != '' and status in ['', None]:
+ # write a status before starting so the row no longer counts as pending
+ gw.set_cell(row, 'status', 'Archive in progress')
- if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
- latest_val = wks.acell(
- columns['status'] + str(i)).value
+ url = expand_url(url)
- # check so we don't step on each others' toes
- if latest_val == '' or latest_val is None:
- wks.update(
- columns['status'] + str(i), 'Archive in progress')
+ for archiver in active_archivers:
+ logger.debug(f'Trying {archiver} on row {row}')
- if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]:
- video_data, status = download_telegram_video(
- v[url_index], s3_client, check_if_exists=True)
-
- if status == 'No telegram video found':
- print("Trying Internet Archive fallback")
+ # TODO: add support for multiple videos/images
+ try:
+ result = archiver.download(url, check_if_exists=True)
+ except Exception as e:
+ # an archiver that raises is treated like one that returned nothing
+ result = False
+ logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
- video_data, status = internet_archive(
- v[url_index], s3_client)
-
- update_sheet(wks, i, status, video_data, columns, v)
+ if result:
+ if result.status in ['success', 'already archived']:
+ logger.success(f'{archiver} succeeded on row {row}')
+ break
+ logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
- else:
- try:
- ydl_opts = {
- 'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
- if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
- print('Using cookie')
- youtube_dl.utils.std_headers['cookie'] = os.getenv(
- 'FB_COOKIE')
- ydl = youtube_dl.YoutubeDL(ydl_opts)
- info = ydl.extract_info(
- v[url_index], download=False)
-
- if 'is_live' in info and info['is_live']:
- wks.update(columns['status'] +
- str(i), 'Recording stream')
- t = threading.Thread(target=record_stream, args=(
- v[url_index], s3_client, wks, i, columns, v))
- t.start()
- continue
- elif 'is_live' not in info or not info['is_live']:
- video_data, status = download_vid(
- v[url_index], s3_client, check_if_exists=True)
- update_sheet(wks, i, status,
- video_data, columns, v)
-
- except:
- # i'm sure there's a better way to handle this than nested try/catch blocks
- try:
- print("Trying Internet Archive fallback")
-
- video_data, status = internet_archive(
- v[url_index], s3_client)
- update_sheet(wks, i, status,
- video_data, columns, v)
-
- except:
- # if any unexpected errors occured, log these into the Google Sheet
- t, value, traceback = sys.exc_info()
-
- update_sheet(wks, i, str(
- value), {}, columns, v)
+ # `result` is whatever the last archiver tried produced: a success, or a
+ # non-success result whose status is still written to the sheet
+ if result:
+ update_sheet(gw, row, result)
+ else:
+ gw.set_cell(row, 'status', 'failed: no archiver')
def main():
+    """Command-line entry point: archive the document named by --sheet.
+
+    A scratch directory 'tmp' is created for the run and removed again
+    afterwards, including when processing fails.
+    """
parser = argparse.ArgumentParser(
- description="Automatically use youtube-dl to download media from a Google Sheet")
- parser.add_argument("--sheet", action="store", dest="sheet")
+        description='Automatically archive social media videos from a Google Sheets document')
+    parser.add_argument('--sheet', action='store', dest='sheet')
args = parser.parse_args()
- print("Opening document " + args.sheet)
+    logger.info(f'Opening document {args.sheet}')
+    mkdir_if_not_exists('tmp')
+    try:
- process_sheet(args.sheet)
+        process_sheet(args.sheet)
+    finally:
+        # remove the scratch directory even if processing raised, so a
+        # failed run does not leave downloaded media behind
+        shutil.rmtree('tmp')
-
-if __name__ == "__main__":
+# run the command-line entry point only when executed as a script, not on import
+if __name__ == '__main__':
main()
diff --git a/auto_auto_archive.py b/auto_auto_archive.py
index f725d10..a518204 100644
--- a/auto_auto_archive.py
+++ b/auto_auto_archive.py
@@ -1,8 +1,7 @@
import gspread
-import subprocess
import argparse
import auto_archive
-import datetime
+from loguru import logger
def main():
parser = argparse.ArgumentParser(
@@ -11,8 +10,7 @@ def main():
args = parser.parse_args()
- print(datetime.datetime.now())
- print("Opening document " + args.sheet)
+ logger.info("Opening document " + args.sheet)
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(args.sheet)
@@ -23,7 +21,7 @@ def main():
for i in range(11, len(values)):
sheet_name = values[i][0]
- print("Processing " + sheet_name)
+ logger.info("Processing " + sheet_name)
auto_archive.process_sheet(sheet_name)
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 53073dc..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-gspread
-youtube_dl
-boto3
-python-dotenv
-
diff --git a/storages/__init__.py b/storages/__init__.py
new file mode 100644
index 0000000..3054d36
--- /dev/null
+++ b/storages/__init__.py
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *
\ No newline at end of file
diff --git a/storages/base_storage.py b/storages/base_storage.py
new file mode 100644
index 0000000..050a8eb
--- /dev/null
+++ b/storages/base_storage.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    """Interface every storage backend has to implement.
+
+    Subclasses provide the constructor, URL generation, an existence
+    check and a file-object upload; uploading from a path on disk is
+    implemented here on top of `uploadf`.
+    """
+
+    @abstractmethod
+    def __init__(self, config):
+        """Set up the backend from its configuration object."""
+
+    @abstractmethod
+    def get_cdn_url(self, path):
+        """Return the URL for the object stored at `path`."""
+
+    @abstractmethod
+    def exists(self, path):
+        """Report whether an object is stored at `path`."""
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs):
+        """Store the contents of the open binary file `file` under `key`."""
+
+    def upload(self, filename: str, key: str, **kwargs):
+        """Open `filename` in binary mode and pass it on to `uploadf`."""
+        with open(filename, 'rb') as source:
+            self.uploadf(source, key, **kwargs)
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
new file mode 100644
index 0000000..188db7e
--- /dev/null
+++ b/storages/s3_storage.py
@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+ """Connection settings consumed by S3Storage."""
+ # name of the bucket objects are read from and written to
+ bucket: str
+ # region identifier; S3Storage builds its endpoint and CDN hostnames from it
+ region: str
+ # access key id handed to the boto3 client
+ key: str
+ # secret access key handed to the boto3 client
+ secret: str
+ # optional prefix that S3Storage prepends to every object key
+ folder: str = ""
+
+
+class S3Storage(Storage):
+    """Storage backend that keeps objects in a DigitalOcean Spaces bucket.
+
+    Every key passed to the public methods is relative to the configured
+    folder, which is prepended before the key is sent to the bucket.
+    """
+
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+
+        # make sure a non-empty folder always ends in exactly one separator
+        folder = config.folder
+        if len(folder) > 0 and not folder.endswith('/'):
+            folder += '/'
+        self.folder = folder
+
+        endpoint = f'https://{self.region}.digitaloceanspaces.com'
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=endpoint,
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        """Return `key` prefixed with the configured folder."""
+        return f'{self.folder}{key}' if False else self.folder + key
+
+    def get_cdn_url(self, key):
+        """Return the CDN URL under which `key` is served."""
+        host = f'{self.bucket}.{self.region}.cdn.digitaloceanspaces.com'
+        return f'https://{host}/{self._get_path(key)}'
+
+    def exists(self, key):
+        """Return True when a HEAD request for `key` succeeds, False on ClientError."""
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+        except ClientError:
+            return False
+        else:
+            return True
+
+    def uploadf(self, file, key, **kwargs):
+        """Upload the open file object `file` to `key`.
+
+        The keyword argument `extra_args` replaces the default ExtraArgs,
+        which make the uploaded object publicly readable.
+        """
+        extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..9b58126
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .gworksheet import GWorksheet
+from .misc import *
\ No newline at end of file
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
new file mode 100644
index 0000000..4349e2a
--- /dev/null
+++ b/utils/gworksheet.py
@@ -0,0 +1,75 @@
+from gspread import utils
+
+
+class GWorksheet:
+    """Wrapper around a gspread worksheet that addresses cells by a
+    logical column name instead of a column letter or index.
+
+    Logical names (the keys of COLUMN_NAMES) map to lower-cased header
+    titles, which are located in the first row of the worksheet.
+    """
+
+    COLUMN_NAMES = {
+        'url': 'media url',
+        'archive': 'archive location',
+        'date': 'archive date',
+        'status': 'archive status',
+        'thumbnail': 'thumbnail',
+        'thumbnail_index': 'thumbnail index',
+        'timestamp': 'upload timestamp',
+        'title': 'upload title',
+        'duration': 'duration'
+    }
+
+    def __init__(self, worksheet, columns=COLUMN_NAMES):
+        self.wks = worksheet
+        # headers are compared case-insensitively, so store them lower-cased
+        first_row = self.wks.row_values(1)
+        self.headers = [title.lower() for title in first_row]
+        self.columns = columns
+
+    def _check_col_exists(self, col: str):
+        """Raise if `col` is not one of the configured logical names."""
+        if col in self.columns:
+            return
+        raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
+
+    def _col_index(self, col: str):
+        """Return the 0-based position of the header belonging to `col`."""
+        self._check_col_exists(col)
+        header = self.columns[col]
+        return self.headers.index(header)
+
+    def col_exists(self, col: str):
+        """Report whether the header belonging to `col` is present in the sheet."""
+        self._check_col_exists(col)
+        header = self.columns[col]
+        return header in self.headers
+
+    def count_rows(self):
+        """Return the number of rows returned by the worksheet's get_values()."""
+        all_values = self.wks.get_values()
+        return len(all_values)
+
+    def get_row(self, row: int):
+        """Return the values of the 1-based row `row`."""
+        return self.wks.row_values(row)
+
+    def get_cell(self, row, col: str):
+        """
+        Return the value at (row, col); `row` is either a 1-based row
+        number or a list of values as returned by self.get_row(row).
+        Columns beyond the end of the row yield ''.
+        """
+        # exact int check: anything else is taken to be the row's values
+        values = self.get_row(row) if type(row) is int else row
+        position = self._col_index(col)
+        return values[position] if position < len(values) else ''
+
+    def set_cell(self, row: int, col: str, val):
+        """Write `val` into (row, col); `row` is 1-based."""
+        sheet_col = self._col_index(col) + 1
+        self.wks.update_cell(row, sheet_col, val)
+
+    def batch_set_cell(self, cell_updates):
+        """
+        Apply a list of (row:int, col:str, val) updates in one batch; the
+        tuple members mean the same as the parameters of self.set_cell().
+        """
+        payload = []
+        for row, col, val in cell_updates:
+            payload.append({'range': self.to_a1(row, col), 'values': [[val]]})
+        self.wks.batch_update(payload, value_input_option='USER_ENTERED')
+
+    def to_a1(self, row: int, col: str):
+        """Return the A1 notation of (row, col); `row` is 1-based."""
+        return utils.rowcol_to_a1(row, self._col_index(col) + 1)
diff --git a/utils/misc.py b/utils/misc.py
new file mode 100644
index 0000000..e8ef66d
--- /dev/null
+++ b/utils/misc.py
@@ -0,0 +1,5 @@
+import os
+
+def mkdir_if_not_exists(folder):
+    """Create the directory `folder` unless it already exists.
+
+    Missing parent directories are created as well, and an already
+    existing directory is left untouched. The check and the creation
+    happen in one call, so a directory that appears in between does not
+    cause an error. A FileExistsError is raised if `folder` exists but
+    is not a directory.
+    """
+    os.makedirs(folder, exist_ok=True)
\ No newline at end of file