From 51d448f0cbabbf2ba6a5c4408ae9b8c09d356019 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Sun, 20 Feb 2022 10:27:25 +0100 Subject: [PATCH 01/16] Refactor archivers to make it easier to add support for new types of URLs --- .gitignore | 1 + Pipfile | 8 +- Pipfile.lock | 421 ++++++++++++++++++++++++++++++++++++------- archivers.py | 390 +++++++++++++++++++++++++++++++++++++++ auto_archive.py | 391 +++++----------------------------------- auto_auto_archive.py | 8 +- requirements.txt | 5 - 7 files changed, 807 insertions(+), 417 deletions(-) create mode 100644 archivers.py delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 4f3d132..b6a6b68 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ tmp/ expmt/ service_account.json __pycache__/ +._* diff --git a/Pipfile b/Pipfile index 8c71f78..88dbebf 100644 --- a/Pipfile +++ b/Pipfile @@ -11,8 +11,14 @@ youtube_dl = "*" argparse = "*" ffmpeg-python = "*" beautifulsoup4 = "*" +nordvpn-switcher = "*" +tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} +telethon = "*" +ffmpeg = "*" +bs4 = "*" +loguru = "*" [dev-packages] [requires] -python_version = "3.8" +python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index ef838d8..8a5f227 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "2aa6e5f9d7cda1a459444bf812fb2f7a4acfe547e7c65a975ab41530f9213da5" + "sha256": "420a5d5c155830dac792fe2f037bebce97c30f4271301bb5950a288254798660" }, "pipfile-spec": 6, "requires": { - "python_version": "3.8" + "python_version": "3.9" }, "sources": [ { @@ -26,49 +26,79 @@ }, "beautifulsoup4": { "hashes": [ - "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", - "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", - "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" + "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", + "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891" ], "index": "pypi", - "version": "==4.9.3" + "version": "==4.10.0" }, "boto3": { "hashes": [ - "sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11", - "sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e" + "sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989", + "sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557" ], "index": "pypi", - "version": "==1.18.27" + "version": "==1.21.3" }, "botocore": { "hashes": [ - "sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5", - "sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5" + "sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7", + "sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046" ], - "version": "==1.21.27" + "markers": "python_version >= '3.6'", + "version": "==1.24.3" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" }, "cachetools": { "hashes": [ - "sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001", - "sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff" + "sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6", + "sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4" ], - "version": "==4.2.2" + "markers": "python_version ~= '3.7'", + "version": "==5.0.0" }, "certifi": { "hashes": [ - "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee", - "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8" + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" ], - "version": "==2021.5.30" + "version": "==2021.10.8" }, "charset-normalizer": { "hashes": [ - "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", - "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3" + "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", + "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], "markers": "python_version >= '3'", - "version": "==2.0.4" + "version": "==2.0.12" + }, + "click": { + "hashes": [ + "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", + "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" + ], + "markers": "python_version >= '3.6'", + "version": "==8.0.4" + }, + "cloudscraper": { + "hashes": [ + "sha256:674fd739f9412188aae8d6614e3e6316939fc0670ef5646abd3d316f1a59d3c2", + "sha256:dda29028c5628b5ba3e4dc43816ed38fd46bd945ef938c420f185586a6d8dff2" + ], + "version": "==1.2.58" + }, + "ffmpeg": { + "hashes": [ + "sha256:6931692c890ff21d39938433c2189747815dca0c60ddc7f9bb97f199dba0b5b9" + ], + "index": "pypi", + "version": "==1.4" }, "ffmpeg-python": { "hashes": [ @@ -78,55 +108,272 @@ "index": "pypi", "version": "==0.2.0" }, + "flask": { + "hashes": [ + "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f", + "sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.3" + }, "future": { "hashes": [ "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, "google-auth": { "hashes": [ - "sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c", - "sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3" + "sha256:218ca03d7744ca0c8b6697b6083334be7df49b7bf76a69d555962fd1a7657b5f", + "sha256:ad160fc1ea8f19e331a16a14a79f3d643d813a69534ba9611d2c80dc10439dad" ], - "version": "==2.0.1" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.6.0" }, "google-auth-oauthlib": { "hashes": [ - "sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba", - "sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67" + "sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73", + "sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a" ], - "version": "==0.4.5" + "markers": "python_version >= '3.6'", + "version": "==0.4.6" }, "gspread": { "hashes": [ - "sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be", - "sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48" + "sha256:d9db8c43d552f541ea072d4727d1e955bc2368b095dd86c5429a845c9d8aed8f", + "sha256:ffba57786e27519fb97125e3de37a0f062134a396506681f5baacaf47a9febe3" ], "index": "pypi", - "version": "==4.0.1" + "version": "==5.1.1" }, "idna": { "hashes": [ - "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a", - "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3" + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], "markers": "python_version >= '3'", - "version": "==3.2" + "version": "==3.3" + }, + "itsdangerous": { + "hashes": [ + "sha256:29285842166554469a56d427addc0843914172343784cb909695fdbe90a3e129", + "sha256:d848fcb8bc7d507c4546b448574e8a44fc4ea2ba84ebf8d783290d53e81992f5" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.0" + }, + "jinja2": { + "hashes": [ + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.3" }, "jmespath": { "hashes": [ "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.10.0" }, + "loguru": { + "hashes": [ + "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", + "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" + ], + "index": "pypi", + "version": "==0.6.0" + }, + "lxml": { + "hashes": [ + "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169", + "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428", + "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc", + "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85", + "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696", + "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507", + "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3", + "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430", + "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03", + "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9", + "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b", + "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7", + "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5", + "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654", + "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca", + "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9", + "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c", + "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63", + "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe", + "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9", + "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9", + "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1", + "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939", + "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68", + "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613", + "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63", + "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e", + "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4", + "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79", + "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1", + "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e", + "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141", + "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb", + "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939", + "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a", + "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93", + "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9", + "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2", + "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6", + "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa", + "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150", + "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea", + "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33", + "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76", + "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807", + "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a", + "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4", + "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15", + "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f", + "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429", + "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c", + "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5", + "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870", + "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b", + "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8", + "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c", + "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87", + "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0", + "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23", + "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170", + "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.8.0" + }, + "markupsafe": { + "hashes": [ + "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", + "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8", + "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759", + "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed", + "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989", + "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3", + "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a", + "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c", + "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c", + "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8", + "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454", + "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad", + "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d", + "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635", + "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61", + "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea", + "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49", + "sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce", + "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e", + "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f", + "sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f", + "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f", + "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7", + "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a", + "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7", + "sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076", + "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb", + "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7", + "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7", + "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c", + "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26", + "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c", + "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8", + "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448", + "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956", + "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05", + "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1", + "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357", + "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea", + "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.0" + }, + "nordvpn-switcher": { + "hashes": [ + "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de", + "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b" + ], + "index": "pypi", + "version": "==0.2.9" + }, "oauthlib": { "hashes": [ - "sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc", - "sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3" + "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", + "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "version": "==3.1.1" + "markers": "python_version >= '3.6'", + "version": "==3.2.0" + }, + "pathlib": { + "hashes": [ + "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" + ], + "version": "==1.0.1" + }, + "psutil": { + "hashes": [ + "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5", + "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a", + "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4", + "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841", + "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d", + "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d", + "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0", + "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845", + "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf", + "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b", + "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07", + "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618", + "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2", + "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd", + "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666", + "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce", + "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3", + "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d", + "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25", + "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492", + "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b", + "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d", + "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2", + "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203", + "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2", + "sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94", + "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9", + "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64", + "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56", + "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3", + "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c", + "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==5.9.0" + }, + "py-mini-racer": { + "hashes": [ + "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57", + "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2", + "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab", + "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11" + ], + "version": "==0.6.0" + }, + "pyaes": { + "hashes": [ + "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" + ], + "version": "==1.6.1" }, "pyasn1": { "hashes": [ @@ -164,80 +411,128 @@ ], "version": "==0.2.8" }, + "pyparsing": { + "hashes": [ + "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", + "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.7" + }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, "python-dotenv": { "hashes": [ - "sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1", - "sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172" + "sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3", + "sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f" ], "index": "pypi", - "version": "==0.19.0" + "version": "==0.19.2" + }, + "random-user-agent": { + "hashes": [ + "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac", + "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012" + ], + "version": "==1.0.1" }, "requests": { "hashes": [ - "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", - "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7" + "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", + "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" ], - "version": "==2.26.0" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.27.1" }, "requests-oauthlib": { "hashes": [ - "sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d", - "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a", - "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc" + "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5", + "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a" ], - "version": "==1.3.0" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.1" + }, + "requests-toolbelt": { + "hashes": [ + "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", + "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" + ], + "version": "==0.9.1" }, "rsa": { "hashes": [ - "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2", - "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9" + "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", + "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "version": "==4.7.2" + "markers": "python_version >= '3.6'", + "version": "==4.8" }, "s3transfer": { "hashes": [ - "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c", - "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803" + "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", + "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" ], - "version": "==0.5.0" + "markers": "python_version >= '3.6'", + "version": "==0.5.1" }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "soupsieve": { "hashes": [ - "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc", - "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b" + "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", + "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9" ], - "markers": "python_version >= '3.0'", - "version": "==2.2.1" + "markers": "python_version >= '3.6'", + "version": "==2.3.1" + }, + "telethon": { + "hashes": [ + "sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816", + "sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c" + ], + "index": "pypi", + "version": "==1.24.0" + }, + "tiktok-downloader": { + "git": "https://github.com/msramalho/tiktok-downloader", + "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2" }, "urllib3": { "hashes": [ - "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", - "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" + "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", + "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "version": "==1.26.6" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.8" + }, + "werkzeug": { + "hashes": [ + "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8", + "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.3" }, "youtube-dl": { "hashes": [ - "sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac", - "sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab" + "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", + "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" ], "index": "pypi", - "version": "==2021.6.6" + "version": "==2021.12.17" } }, "develop": {} -} +} \ No newline at end of file diff --git a/archivers.py b/archivers.py new file mode 100644 index 0000000..d8a72f6 --- /dev/null +++ b/archivers.py @@ -0,0 +1,390 @@ +from dataclasses import dataclass +import youtube_dl +from bs4 import BeautifulSoup +import requests +import tiktok_downloader +from loguru import logger +import os +import datetime +import ffmpeg +from botocore.errorfactory import ClientError +import time +import traceback + +# TODO There should be a better way of generating keys, that adds the following info: +# - name of sheet that it is being archived from +# (this means we might archive the same media twice on different sheets, but that's OK I think) +# - name of archiver/platform that the video comes from +# This should make it easier to maintain and clean the archive later + +# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be +# cleaned up? Difficult is we don't know the filename until the archivers start working. + + +def get_cdn_url(key): + return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( + os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) + + +def do_s3_upload(s3_client, f, key): + s3_client.upload_fileobj(f, Bucket=os.getenv( + 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) + + +def get_key(filename): + key = filename.split('/')[1] + if 'unknown_video' in key: + key = key.replace('unknown_video', 'jpg') + return key + + +def get_thumbnails(filename, s3_client, duration=None): + if not os.path.exists(filename.split('.')[0]): + os.mkdir(filename.split('.')[0]) + + fps = 0.5 + if duration is not None: + duration = float(duration) + + if duration < 60: + fps = 10.0 / duration + elif duration < 120: + fps = 20.0 / duration + else: + fps = 40.0 / duration + + stream = ffmpeg.input(filename) + stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) + stream.output(filename.split('.')[0] + '/out%d.jpg').run() + + thumbnails = os.listdir(filename.split('.')[0] + '/') + cdn_urls = [] + + for fname in thumbnails: + if fname[-3:] == 'jpg': + thumbnail_filename = filename.split('.')[0] + '/' + fname + key = filename.split('/')[1].split('.')[0] + '/' + fname + + cdn_url = get_cdn_url(key) + + with open(thumbnail_filename, 'rb') as f: + do_s3_upload(s3_client, f, key) + + cdn_urls.append(cdn_url) + os.remove(thumbnail_filename) + + if len(cdn_urls) == 0: + return ('None', 'None') + + key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] + + index_page = f'''{filename} + ''' + + for t in cdn_urls: + index_page += f'' + + index_page += f"" + index_fname = filename.split('.')[0] + '/index.html' + + with open(index_fname, 'w') as f: + f.write(index_page) + + thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' + + s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( + 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) + + thumb_index_cdn_url = get_cdn_url(thumb_index) + + return (key_thumb, thumb_index_cdn_url) + + +@dataclass +class ArchiveResult: + status: str + cdn_url: str = None + thumbnail: str = None + thumbnail_index: str = None + duration: float = None + title: str = None + timestamp: datetime.datetime = None + + +class Archiver: + def __init__(self, s3_client): + self.s3 = s3_client + + def download(self, url): + pass + + +class TelegramArchiver(Archiver): + def download(self, url, check_if_exists=False): + # detect URLs that we definitely cannot handle + if 'http://t.me/' not in url and 'https://t.me/' not in url: + return False + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} + status = "success" + + original_url = url + + if url[-8:] != "?embed=1": + url += "?embed=1" + + t = requests.get(url, headers=headers) + s = BeautifulSoup(t.content, 'html.parser') + video = s.find("video") + + if video is None: + return False # could not find video + + video_url = video.get('src') + key = video_url.split('/')[-1].split('?')[0] + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + v = requests.get(video_url, headers=headers) + + with open(filename, 'wb') as f: + f.write(v.content) + + if status != 'already archived': + cdn_url = get_cdn_url(key) + + with open(filename, 'rb') as f: + do_s3_upload(self.s3, f, key) + + # extract duration from HTML + duration = s.find_all('time')[0].contents[0] + if ':' in duration: + duration = float(duration.split( + ':')[0])*60 + float(duration.split(':')[1]) + else: + duration = float(duration) + + # process thumbnails + key_thumb, thumb_index = get_thumbnails( + filename, self.s3, duration=duration) + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, + duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) + + +class YoutubeDLArchiver(Archiver): + def download(self, url, check_if_exists=False): + ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} + if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): + logger.info('Using Facebook cookie') + youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') + + ydl = youtube_dl.YoutubeDL(ydl_opts) + cdn_url = None + status = 'success' + + try: + info = ydl.extract_info(url, download=False) + except youtube_dl.utils.DownloadError: + # no video here + return False + + if 'is_live' in info and info['is_live']: + logger.warning("Live streaming media, not archiving now") + return ArchiveResult(status="Streaming media") + + if check_if_exists: + if 'entries' in info: + if len(info['entries']) > 1: + logger.warning( + 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + return False + + filename = ydl.prepare_filename(info['entries'][0]) + else: + filename = ydl.prepare_filename(info) + + key = get_key(filename) + + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + # sometimes this results in a different filename, so do this again + info = ydl.extract_info(url, download=True) + + if 'entries' in info: + if len(info['entries']) > 1: + logger.warning( + 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + return False + else: + info = info['entries'][0] + + filename = ydl.prepare_filename(info) + + if not os.path.exists(filename): + filename = filename.split('.')[0] + '.mkv' + + if status != 'already archived': + key = get_key(filename) + cdn_url = get_cdn_url(key) + + with open(filename, 'rb') as f: + do_s3_upload(self.s3, f, key) + + # get duration + duration = info['duration'] if 'duration' in info else None + + # get thumbnails + key_thumb, thumb_index = get_thumbnails( + filename, self.s3, duration=duration) + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, + title=info['title'] if 'title' in info else None, + timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) + + +class WaybackArchiver(Archiver): + def __init__(self, s3_client): + self.s3 = s3_client + self.seen_urls = {} + + def download(self, url, check_if_exists=False): + if check_if_exists and url in self.seen_urls: + return self.seen_urls[url] + + ia_headers = { + "Accept": "application/json", + "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') + } + + r = requests.post( + 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + + if r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + job_id = r.json()['job_id'] + + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + + retries = 0 + + # wait 90-120 seconds for the archive job to finish + while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: + time.sleep(3) + + try: + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + except: + time.sleep(1) + + retries += 1 + + if status_r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + status_json = status_r.json() + + if status_json['status'] != 'success': + return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + + archive_url = 'https://web.archive.org/web/' + \ + status_json['timestamp'] + '/' + status_json['original_url'] + + try: + r = requests.get(archive_url) + + parsed = BeautifulSoup( + r.content, 'html.parser') + + title = parsed.find_all('title')[ + 0].text + except: + title = "Could not get title" + + result = ArchiveResult( + status='Internet Archive fallback', cdn_url=archive_url, title=title) + self.seen_urls[url] = result + return result + + +class TiktokArchiver(Archiver): + def download(self, url, check_if_exists=False): + if 'tiktok.com' not in url: + return False + + status = 'success' + + try: + info = tiktok_downloader.info_post(url) + key = 'tiktok_' + str(info.id) + '.mp4' + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + if status != 'already archived': + media = tiktok_downloader.snaptik(url).get_media() + if len(media) > 0: + media[0].download(filename) + with open(filename, 'rb') as f: + do_s3_upload(self.s3, f, key) + + cdn_url = get_cdn_url(key) + else: + status = 'could not download media' + + try: + key_thumb, thumb_index = get_thumbnails( + filename, self.s3, duration=info.duration) + except: + key_thumb = '' + thumb_index = 'error creating thumbnails' + + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, + thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) + + except tiktok_downloader.Except.InvalidUrl: + status = 'Invalid URL' + return ArchiveResult(status=status) + + except: + error = traceback.format_exc() + status = 'Other Tiktok error: ' + str(error) + return ArchiveResult(status=status) diff --git a/auto_archive.py b/auto_archive.py index f0f6862..ef4f89c 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,19 +1,15 @@ +from dataclasses import dataclass import gspread -import youtube_dl from pathlib import Path -import sys import datetime import boto3 import os from dotenv import load_dotenv -from botocore.errorfactory import ClientError import argparse import math -import ffmpeg import threading -import time -from bs4 import BeautifulSoup -import requests +from loguru import logger +import archivers load_dotenv() @@ -46,328 +42,64 @@ def index_to_col(index): else: return alphabet[index] -def get_cdn_url(key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) -def do_s3_upload(s3_client, f, key): - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - - -def get_thumbnails(filename, s3_client, duration = None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() - - thumbnails = os.listdir(filename.split('.')[0] + '/') - cdn_urls = [] - - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname - - cdn_url = get_cdn_url(key) - - with open(thumbnail_filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) - - if len(cdn_urls) == 0: - return ('None', 'None') - - key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - - s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) - - thumb_index_cdn_url = get_cdn_url(thumb_index) - - return (key_thumb, thumb_index_cdn_url) - - -def download_telegram_video(url, s3_client, check_if_exists=False): - status = 'success' - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} - - original_url = url - - if url[-8:] != "?embed=1": - url += "?embed=1" - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - return ({}, 'No telegram video found') - else: - video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - filename = 'tmp/' + key - - if check_if_exists: - try: - s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split(':')[0])*60 + float(duration.split(':')[1]) - else: - duration = float(duration) - - key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration) - os.remove(filename) - - video_data = { - 'cdn_url': cdn_url, - 'thumbnail': key_thumb, - 'thumbnail_index': thumb_index, - 'duration': duration, - 'title': original_url, - 'timestamp': s.find_all('time')[1].get('datetime') - } - - return (video_data, status) - - -def internet_archive(url, s3_client): - - - ia_headers = { - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - return ({}, 'Internet archive failed') - else: - job_id = r.json()['job_id'] - - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - - retries = 0 - - while status_r.json()['status'] == 'pending' and retries < 40: - time.sleep(5) - - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - except: - time.sleep(1) - - retries += 1 - - status_json = status_r.json() - - if status_json['status'] == 'success': - url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] - - r = requests.get(url) - - parsed = BeautifulSoup( - r.content, 'html.parser') - title = parsed.find_all('title')[ - 0].text - - return ({'cdn_url': url, 'title': title}, 'Internet Archive fallback') - else: - return ({}, 'Internet Archive failed: ' + status_json['message']) - -def get_key(filename): - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key - - -def download_vid(url, s3_client, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): - print('Using cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - ydl = youtube_dl.YoutubeDL(ydl_opts) - cdn_url = None - status = 'success' - - if check_if_exists: - info = ydl.extract_info(url, download=False) - - if 'entries' in info: - if len(info['entries']) > 1: - raise Exception( - 'ERROR: Cannot archive channels or pages with multiple videos') - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = get_key(filename) - - try: - s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(os, key) - - status = 'already archived' - - except ClientError: - pass - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - if 'entries' in info: - if len(info['entries']) > 1: - raise Exception( - 'ERROR: Cannot archive channels or pages with multiple videos') - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = get_key(filename) - cdn_url = get_cdn_url(os, key) - - with open(filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - duration = info['duration'] if 'duration' in info else None - key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration) - os.remove(filename) - - video_data = { - 'cdn_url': cdn_url, - 'thumbnail': key_thumb, - 'thumbnail_index': thumb_index, - 'duration': duration, - 'title': info['title'] if 'title' in info else None, - 'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None, - } - - return (video_data, status) - - -def update_sheet(wks, row, status, video_data, columns, v): +def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v): update = [] if columns['status'] is not None: update += [{ 'range': columns['status'] + str(row), - 'values': [[status]] + 'values': [[result.status]] }] - if 'cdn_url' in video_data and video_data['cdn_url'] is not None and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '': + if result.cdn_url and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '': update += [{ 'range': columns['archive'] + str(row), - 'values': [[video_data['cdn_url']]] + 'values': [[result.cdn_url]] }] - if 'date' in video_data and columns['date'] is not None and v[col_to_index(columns['date'])] == '': + if columns['date'] is not None and v[col_to_index(columns['date'])] == '': update += [{ 'range': columns['date'] + str(row), 'values': [[datetime.datetime.now().isoformat()]] }] - if 'thumbnail' in video_data and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '': + if result.thumbnail and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '': update += [{ 'range': columns['thumbnail'] + str(row), - 'values': [['=IMAGE("' + video_data['thumbnail'] + '")']] + 'values': [['=IMAGE("' + result.thumbnail + '")']] }] - if 'thumbnail_index' in video_data and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '': + if result.thumbnail_index and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '': update += [{ 'range': columns['thumbnail_index'] + str(row), - 'values': [[video_data['thumbnail_index']]] + 'values': [[result.thumbnail_index]] }] - if 'timestamp' in video_data and columns['timestamp'] is not None and video_data['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '': + if result.timestamp and columns['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '': update += [{ 'range': columns['timestamp'] + str(row), - 'values': [[video_data['timestamp']]] if type(video_data['timestamp']) == str else [[datetime.datetime.fromtimestamp(video_data['timestamp']).isoformat()]] + 'values': [[result.timestamp]] if type(result.timestamp) == str else [[datetime.datetime.fromtimestamp(result.timestamp).isoformat()]] }] - if 'title' in video_data and columns['title'] is not None and video_data['title'] is not None and v[col_to_index(columns['title'])] == '': + if result.title and columns['title'] is not None and v[col_to_index(columns['title'])] == '': update += [{ 'range': columns['title'] + str(row), - 'values': [[video_data['title']]] + 'values': [[result.title]] }] - if 'duration' in video_data and columns['duration'] is not None and video_data['duration'] is not None and v[col_to_index(columns['duration'])] == '': + if result.duration and columns['duration'] is not None and v[col_to_index(columns['duration'])] == '': update += [{ 'range': columns['duration'] + str(row), - 'values': [[str(video_data['duration'])]] + 'values': [[str(result.duration)]] }] wks.batch_update(update, value_input_option='USER_ENTERED') -def record_stream(url, s3_client, wks, i, columns, v): - video_data, status = download_vid(url, s3_client) - update_sheet(wks, i, status, video_data, columns, v) +# def record_stream(url, s3_client, wks, i, columns, v): +# video_data, status = download_vid(url, s3_client) +# update_sheet(wks, i, status, video_data, columns, v) def process_sheet(sheet): @@ -384,7 +116,7 @@ def process_sheet(sheet): # loop through worksheets to check for ii in range(n_worksheets): - print("Opening worksheet " + str(ii)) + logger.info("Opening worksheet " + str(ii)) wks = sh.get_worksheet(ii) values = wks.get_all_values() @@ -396,7 +128,7 @@ def process_sheet(sheet): 'source url')) if 'source url' in headers else None if columns['url'] is None: - print("No 'Media URL' column found, skipping") + logger.warning("No 'Media URL' column found, skipping") continue url_index = col_to_index(columns['url']) @@ -409,7 +141,7 @@ def process_sheet(sheet): 'archive status')) if 'archive status' in headers else None if columns['status'] is None: - print("No 'Archive status' column found, skipping") + logger.warning("No 'Archive status' column found, skipping") continue columns['thumbnail'] = index_to_col(headers.index( @@ -423,6 +155,15 @@ def process_sheet(sheet): columns['duration'] = index_to_col(headers.index( 'duration')) if 'duration' in headers else None + + active_archivers = [ + archivers.TelegramArchiver(s3_client), + archivers.TiktokArchiver(s3_client), + archivers.YoutubeDLArchiver(s3_client), + archivers.WaybackArchiver(s3_client) + ] + + # loop through rows in worksheet for i in range(2, len(values)+1): v = values[i-1] @@ -434,61 +175,25 @@ def process_sheet(sheet): # check so we don't step on each others' toes if latest_val == '' or latest_val is None: wks.update( - columns['status'] + str(i), 'Archive in progress') + columns['status'] + str(i), 'Archive in progress') - if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]: - video_data, status = download_telegram_video( - v[url_index], s3_client, check_if_exists=True) - - if status == 'No telegram video found': - print("Trying Internet Archive fallback") + for archiver in active_archivers: + logger.debug(f"Trying {archiver} on row {i}") + result = archiver.download(v[url_index], check_if_exists=True) + if result: + logger.info(f"{archiver} succeeded on row {i}") + break - video_data, status = internet_archive( - v[url_index], s3_client) - - update_sheet(wks, i, status, video_data, columns, v) + if result: + update_sheet(wks, i, result, columns, v) - else: - try: - ydl_opts = { - 'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'): - print('Using cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv( - 'FB_COOKIE') - ydl = youtube_dl.YoutubeDL(ydl_opts) - info = ydl.extract_info( - v[url_index], download=False) - if 'is_live' in info and info['is_live']: - wks.update(columns['status'] + - str(i), 'Recording stream') - t = threading.Thread(target=record_stream, args=( - v[url_index], s3_client, wks, i, columns, v)) - t.start() - continue - elif 'is_live' not in info or not info['is_live']: - video_data, status = download_vid( - v[url_index], s3_client, check_if_exists=True) - update_sheet(wks, i, status, - video_data, columns, v) - - except: - # i'm sure there's a better way to handle this than nested try/catch blocks - try: - print("Trying Internet Archive fallback") + # except: + # if any unexpected errors occured, log these into the Google Sheet + # t, value, traceback = sys.exc_info() - video_data, status = internet_archive( - v[url_index], s3_client) - update_sheet(wks, i, status, - video_data, columns, v) - - except: - # if any unexpected errors occured, log these into the Google Sheet - t, value, traceback = sys.exc_info() - - update_sheet(wks, i, str( - value), {}, columns, v) + # update_sheet(wks, i, str( + # value), {}, columns, v) def main(): @@ -497,7 +202,7 @@ def main(): parser.add_argument("--sheet", action="store", dest="sheet") args = parser.parse_args() - print("Opening document " + args.sheet) + logger.info("Opening document " + args.sheet) process_sheet(args.sheet) diff --git a/auto_auto_archive.py b/auto_auto_archive.py index f725d10..a518204 100644 --- a/auto_auto_archive.py +++ b/auto_auto_archive.py @@ -1,8 +1,7 @@ import gspread -import subprocess import argparse import auto_archive -import datetime +from loguru import logger def main(): parser = argparse.ArgumentParser( @@ -11,8 +10,7 @@ def main(): args = parser.parse_args() - print(datetime.datetime.now()) - print("Opening document " + args.sheet) + logger.info("Opening document " + args.sheet) gc = gspread.service_account(filename='service_account.json') sh = gc.open(args.sheet) @@ -23,7 +21,7 @@ def main(): for i in range(11, len(values)): sheet_name = values[i][0] - print("Processing " + sheet_name) + logger.info("Processing " + sheet_name) auto_archive.process_sheet(sheet_name) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 53073dc..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -gspread -youtube_dl -boto3 -python-dotenv - From 009c0dd8cadb3420fc99acc1e6b1243c9f0f4ddb Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Sun, 20 Feb 2022 11:06:47 +0100 Subject: [PATCH 02/16] Clean up dependencies --- Pipfile | 4 +--- Pipfile.lock | 25 ++----------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/Pipfile b/Pipfile index 88dbebf..0d954c9 100644 --- a/Pipfile +++ b/Pipfile @@ -9,14 +9,12 @@ boto3 = "*" python-dotenv = "*" youtube_dl = "*" argparse = "*" -ffmpeg-python = "*" beautifulsoup4 = "*" nordvpn-switcher = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} -telethon = "*" -ffmpeg = "*" bs4 = "*" loguru = "*" +ffmpeg-python = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 8a5f227..b354d59 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "420a5d5c155830dac792fe2f037bebce97c30f4271301bb5950a288254798660" + "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060" }, "pipfile-spec": 6, "requires": { @@ -93,13 +93,6 @@ ], "version": "==1.2.58" }, - "ffmpeg": { - "hashes": [ - "sha256:6931692c890ff21d39938433c2189747815dca0c60ddc7f9bb97f199dba0b5b9" - ], - "index": "pypi", - "version": "==1.4" - }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -369,12 +362,6 @@ ], "version": "==0.6.0" }, - "pyaes": { - "hashes": [ - "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" - ], - "version": "==1.6.1" - }, "pyasn1": { "hashes": [ "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", @@ -497,14 +484,6 @@ "markers": "python_version >= '3.6'", "version": "==2.3.1" }, - "telethon": { - "hashes": [ - "sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816", - "sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c" - ], - "index": "pypi", - "version": "==1.24.0" - }, "tiktok-downloader": { "git": "https://github.com/msramalho/tiktok-downloader", "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2" @@ -535,4 +514,4 @@ } }, "develop": {} -} \ No newline at end of file +} From f3ce22666562bed2780181dbef95b8dee5a5e69e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 21 Feb 2022 14:19:09 +0100 Subject: [PATCH 03/16] split into multiple files MVP --- .gitignore | 3 +- Pipfile | 1 - Pipfile.lock | 145 +----------- README.md | 2 + archivers.py | 390 -------------------------------- archivers/__init__.py | 6 + archivers/base_archiver.py | 115 ++++++++++ archivers/telegram_archiver.py | 76 +++++++ archivers/tiktok_archiver.py | 68 ++++++ archivers/wayback_archiver.py | 73 ++++++ archivers/youtubedl_archiver.py | 88 +++++++ auto_archive.py | 15 +- 12 files changed, 446 insertions(+), 536 deletions(-) delete mode 100644 archivers.py create mode 100644 archivers/__init__.py create mode 100644 archivers/base_archiver.py create mode 100644 archivers/telegram_archiver.py create mode 100644 archivers/tiktok_archiver.py create mode 100644 archivers/wayback_archiver.py create mode 100644 archivers/youtubedl_archiver.py diff --git a/.gitignore b/.gitignore index b6a6b68..5d7eec9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ tmp/ -.env +.env* .DS_Store expmt/ service_account.json __pycache__/ ._* +anu.html \ No newline at end of file diff --git a/Pipfile b/Pipfile index 0d954c9..27071fa 100644 --- a/Pipfile +++ b/Pipfile @@ -10,7 +10,6 @@ python-dotenv = "*" youtube_dl = "*" argparse = "*" beautifulsoup4 = "*" -nordvpn-switcher = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} bs4 = "*" loguru = "*" diff --git a/Pipfile.lock b/Pipfile.lock index b354d59..9879884 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060" + "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be" }, "pipfile-spec": 6, "requires": { @@ -93,6 +93,14 @@ ], "version": "==1.2.58" }, + "faker": { + "hashes": [ + "sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b", + "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe" + ], + "markers": "python_version >= '3.6'", + "version": "==13.0.0" + }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -180,73 +188,6 @@ "index": "pypi", "version": "==0.6.0" }, - "lxml": { - "hashes": [ - "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169", - "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428", - "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc", - "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85", - "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696", - "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507", - "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3", - "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430", - "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03", - "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9", - "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b", - "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7", - "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5", - "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654", - "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca", - "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9", - "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c", - "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63", - "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe", - "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9", - "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9", - "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1", - "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939", - "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68", - "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613", - "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63", - "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e", - "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4", - "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79", - "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1", - "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e", - "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141", - "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb", - "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939", - "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a", - "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93", - "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9", - "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2", - "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6", - "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa", - "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150", - "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea", - "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33", - "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76", - "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807", - "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a", - "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4", - "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15", - "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f", - "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429", - "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c", - "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5", - "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870", - "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b", - "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8", - "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c", - "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87", - "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0", - "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23", - "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170", - "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==4.8.0" - }, "markupsafe": { "hashes": [ "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", @@ -293,14 +234,6 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, - "nordvpn-switcher": { - "hashes": [ - "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de", - "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b" - ], - "index": "pypi", - "version": "==0.2.9" - }, "oauthlib": { "hashes": [ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", @@ -309,59 +242,6 @@ "markers": "python_version >= '3.6'", "version": "==3.2.0" }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - ], - "version": "==1.0.1" - }, - "psutil": { - "hashes": [ - "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5", - "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a", - "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4", - "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841", - "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d", - "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d", - "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0", - "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845", - "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf", - "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b", - "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07", - "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618", - "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2", - "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd", - "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666", - "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce", - "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3", - "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d", - "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25", - "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492", - "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b", - "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d", - "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2", - "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203", - "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2", - "sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94", - "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9", - "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64", - "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56", - "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3", - "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c", - "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3" - ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==5.9.0" - }, - "py-mini-racer": { - "hashes": [ - "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57", - "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2", - "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab", - "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11" - ], - "version": "==0.6.0" - }, "pyasn1": { "hashes": [ "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", @@ -422,13 +302,6 @@ "index": "pypi", "version": "==0.19.2" }, - "random-user-agent": { - "hashes": [ - "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac", - "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012" - ], - "version": "==1.0.1" - }, "requests": { "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", diff --git a/README.md b/README.md index 2e40bcc..cec6e9a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. +[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. + A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables: ``` diff --git a/archivers.py b/archivers.py deleted file mode 100644 index d8a72f6..0000000 --- a/archivers.py +++ /dev/null @@ -1,390 +0,0 @@ -from dataclasses import dataclass -import youtube_dl -from bs4 import BeautifulSoup -import requests -import tiktok_downloader -from loguru import logger -import os -import datetime -import ffmpeg -from botocore.errorfactory import ClientError -import time -import traceback - -# TODO There should be a better way of generating keys, that adds the following info: -# - name of sheet that it is being archived from -# (this means we might archive the same media twice on different sheets, but that's OK I think) -# - name of archiver/platform that the video comes from -# This should make it easier to maintain and clean the archive later - -# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be -# cleaned up? Difficult is we don't know the filename until the archivers start working. - - -def get_cdn_url(key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - - -def do_s3_upload(s3_client, f, key): - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - - -def get_key(filename): - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key - - -def get_thumbnails(filename, s3_client, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() - - thumbnails = os.listdir(filename.split('.')[0] + '/') - cdn_urls = [] - - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname - - cdn_url = get_cdn_url(key) - - with open(thumbnail_filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) - - if len(cdn_urls) == 0: - return ('None', 'None') - - key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - - s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) - - thumb_index_cdn_url = get_cdn_url(thumb_index) - - return (key_thumb, thumb_index_cdn_url) - - -@dataclass -class ArchiveResult: - status: str - cdn_url: str = None - thumbnail: str = None - thumbnail_index: str = None - duration: float = None - title: str = None - timestamp: datetime.datetime = None - - -class Archiver: - def __init__(self, s3_client): - self.s3 = s3_client - - def download(self, url): - pass - - -class TelegramArchiver(Archiver): - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} - status = "success" - - original_url = url - - if url[-8:] != "?embed=1": - url += "?embed=1" - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - return False # could not find video - - video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # extract duration from HTML - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0])*60 + float(duration.split(':')[1]) - else: - duration = float(duration) - - # process thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) - - -class YoutubeDLArchiver(Archiver): - def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): - logger.info('Using Facebook cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - - ydl = youtube_dl.YoutubeDL(ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except youtube_dl.utils.DownloadError: - # no video here - return False - - if 'is_live' in info and info['is_live']: - logger.warning("Live streaming media, not archiving now") - return ArchiveResult(status="Streaming media") - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = get_key(filename) - - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = get_key(filename) - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # get duration - duration = info['duration'] if 'duration' in info else None - - # get thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, - timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) - - -class WaybackArchiver(Archiver): - def __init__(self, s3_client): - self.s3 = s3_client - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if check_if_exists and url in self.seen_urls: - return self.seen_urls[url] - - ia_headers = { - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - job_id = r.json()['job_id'] - - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - - retries = 0 - - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - except: - time.sleep(1) - - retries += 1 - - if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - status_json = status_r.json() - - if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) - - archive_url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] - - try: - r = requests.get(archive_url) - - parsed = BeautifulSoup( - r.content, 'html.parser') - - title = parsed.find_all('title')[ - 0].text - except: - title = "Could not get title" - - result = ArchiveResult( - status='Internet Archive fallback', cdn_url=archive_url, title=title) - self.seen_urls[url] = result - return result - - -class TiktokArchiver(Archiver): - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = 'tiktok_' + str(info.id) + '.mp4' - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - if status != 'already archived': - media = tiktok_downloader.snaptik(url).get_media() - if len(media) > 0: - media[0].download(filename) - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - cdn_url = get_cdn_url(key) - else: - status = 'could not download media' - - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=info.duration) - except: - key_thumb = '' - thumb_index = 'error creating thumbnails' - - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) - - except tiktok_downloader.Except.InvalidUrl: - status = 'Invalid URL' - return ArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - return ArchiveResult(status=status) diff --git a/archivers/__init__.py b/archivers/__init__.py new file mode 100644 index 0000000..e6c4ba6 --- /dev/null +++ b/archivers/__init__.py @@ -0,0 +1,6 @@ +# we need to explicitly expose the available imports here +from .base_archiver import * +from .telegram_archiver import * +from .tiktok_archiver import * +from .wayback_archiver import * +from .youtubedl_archiver import * \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py new file mode 100644 index 0000000..3f9f4ac --- /dev/null +++ b/archivers/base_archiver.py @@ -0,0 +1,115 @@ +import os +import ffmpeg +from dataclasses import dataclass +import datetime +from loguru import logger + +# TODO There should be a better way of generating keys, that adds the following info: +# - name of sheet that it is being archived from +# (this means we might archive the same media twice on different sheets, but that's OK I think) +# - name of archiver/platform that the video comes from +# This should make it easier to maintain and clean the archive later + +# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be +# cleaned up? Difficult is we don't know the filename until the archivers start working. + + +@dataclass +class ArchiveResult: + status: str + cdn_url: str = None + thumbnail: str = None + thumbnail_index: str = None + duration: float = None + title: str = None + timestamp: datetime.datetime = None + + +class Archiver: + name = "default" + + def __init__(self, s3_client): + self.s3 = s3_client + + def __str__(self): + return self.__class__.__name__ + + def download(self, url, check_if_exists=False): + logger.error("method 'download' not implemented") + + def get_cdn_url(self, key): + return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( + os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) + + def do_s3_upload(self, f, key): + self.s3.upload_fileobj(f, Bucket=os.getenv( + 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) + + def get_key(self, filename): + print(f"key base implementation: {self.name}") + # TODO: refactor to be more manageable + key = filename.split('/')[1] + if 'unknown_video' in key: + key = key.replace('unknown_video', 'jpg') + return key + + def get_thumbnails(self, filename, duration=None): + if not os.path.exists(filename.split('.')[0]): + os.mkdir(filename.split('.')[0]) + + fps = 0.5 + if duration is not None: + duration = float(duration) + + if duration < 60: + fps = 10.0 / duration + elif duration < 120: + fps = 20.0 / duration + else: + fps = 40.0 / duration + + stream = ffmpeg.input(filename) + stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) + stream.output(filename.split('.')[0] + '/out%d.jpg').run() + + thumbnails = os.listdir(filename.split('.')[0] + '/') + cdn_urls = [] + + for fname in thumbnails: + if fname[-3:] == 'jpg': + thumbnail_filename = filename.split('.')[0] + '/' + fname + key = filename.split('/')[1].split('.')[0] + '/' + fname + + cdn_url = self.get_cdn_url(key) + + with open(thumbnail_filename, 'rb') as f: + self.do_s3_upload(f, key) + + cdn_urls.append(cdn_url) + os.remove(thumbnail_filename) + + if len(cdn_urls) == 0: + return ('None', 'None') + + key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] + + index_page = f'''{filename} + ''' + + for t in cdn_urls: + index_page += f'' + + index_page += f"" + index_fname = filename.split('.')[0] + '/index.html' + + with open(index_fname, 'w') as f: + f.write(index_page) + + thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' + + self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( + 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) + + thumb_index_cdn_url = self.get_cdn_url(thumb_index) + + return (key_thumb, thumb_index_cdn_url) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py new file mode 100644 index 0000000..d9168e4 --- /dev/null +++ b/archivers/telegram_archiver.py @@ -0,0 +1,76 @@ +import os +import requests +from bs4 import BeautifulSoup +from botocore.errorfactory import ClientError +from .base_archiver import Archiver, ArchiveResult + +# TODO: get_cdn_url, get_thumbnails, do_s3_upload + + +class TelegramArchiver(Archiver): + name = "telegram" + + def download(self, url, check_if_exists=False): + # detect URLs that we definitely cannot handle + if 'http://t.me/' not in url and 'https://t.me/' not in url: + return False + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + status = "success" + + original_url = url + + # TODO: check if we can do this more resilient to user-input + if url[-8:] != "?embed=1": + url += "?embed=1" + + t = requests.get(url, headers=headers) + s = BeautifulSoup(t.content, 'html.parser') + video = s.find("video") + + if video is None: + return False # could not find video + + video_url = video.get('src') + key = video_url.split('/')[-1].split('?')[0] + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = self.get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + v = requests.get(video_url, headers=headers) + + with open(filename, 'wb') as f: + f.write(v.content) + + if status != 'already archived': + cdn_url = self.get_cdn_url(key) + + with open(filename, 'rb') as f: + self.do_s3_upload(f, key) + + # extract duration from HTML + duration = s.find_all('time')[0].contents[0] + if ':' in duration: + duration = float(duration.split( + ':')[0]) * 60 + float(duration.split(':')[1]) + else: + duration = float(duration) + + # process thumbnails + key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, + duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py new file mode 100644 index 0000000..1e3bcaf --- /dev/null +++ b/archivers/tiktok_archiver.py @@ -0,0 +1,68 @@ +import os, traceback +from botocore.errorfactory import ClientError +import tiktok_downloader +from loguru import logger +from .base_archiver import Archiver, ArchiveResult + +# TODO: get_cdn_url, do_s3_upload, get_thumbnails + + +class TiktokArchiver(Archiver): + name = "tiktok" + + def download(self, url, check_if_exists=False): + if 'tiktok.com' not in url: + return False + + status = 'success' + + try: + info = tiktok_downloader.info_post(url) + key = 'tiktok_' + str(info.id) + '.mp4' + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = self.get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + if status != 'already archived': + media = tiktok_downloader.snaptik(url).get_media() + if len(media) > 0: + media[0].download(filename) + with open(filename, 'rb') as f: + self.do_s3_upload(f, key) + + cdn_url = self.get_cdn_url(key) + else: + status = 'could not download media' + + try: + key_thumb, thumb_index = self.get_thumbnails( + filename, duration=info.duration) + except: + key_thumb = '' + thumb_index = 'error creating thumbnails' + + try: os.remove(filename) + except FileNotFoundError: + logger.info(f'tmp file not found thus not deleted {filename}') + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, + thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) + + except tiktok_downloader.Except.InvalidUrl: + status = 'Invalid URL' + return ArchiveResult(status=status) + + except: + error = traceback.format_exc() + status = 'Other Tiktok error: ' + str(error) + return ArchiveResult(status=status) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py new file mode 100644 index 0000000..a021324 --- /dev/null +++ b/archivers/wayback_archiver.py @@ -0,0 +1,73 @@ +import time, requests, os +from bs4 import BeautifulSoup + +from .base_archiver import Archiver, ArchiveResult + + +class WaybackArchiver(Archiver): + name = "wayback" + + def __init__(self, s3_client): + self.s3 = s3_client + self.seen_urls = {} + + def download(self, url, check_if_exists=False): + if check_if_exists and url in self.seen_urls: + return self.seen_urls[url] + + ia_headers = { + "Accept": "application/json", + "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') + } + + r = requests.post( + 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + + if r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + job_id = r.json()['job_id'] + + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + + retries = 0 + + # wait 90-120 seconds for the archive job to finish + while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: + time.sleep(3) + + try: + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + except: + time.sleep(1) + + retries += 1 + + if status_r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + status_json = status_r.json() + + if status_json['status'] != 'success': + return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + + archive_url = 'https://web.archive.org/web/' + \ + status_json['timestamp'] + '/' + status_json['original_url'] + + try: + r = requests.get(archive_url) + + parsed = BeautifulSoup( + r.content, 'html.parser') + + title = parsed.find_all('title')[ + 0].text + except: + title = "Could not get title" + + result = ArchiveResult( + status='Internet Archive fallback', cdn_url=archive_url, title=title) + self.seen_urls[url] = result + return result diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py new file mode 100644 index 0000000..8249cfa --- /dev/null +++ b/archivers/youtubedl_archiver.py @@ -0,0 +1,88 @@ + +import os +import datetime +import youtube_dl +from loguru import logger +from botocore.errorfactory import ClientError +from .base_archiver import Archiver, ArchiveResult + +class YoutubeDLArchiver(Archiver): + name = "yotube_dl" + + def download(self, url, check_if_exists=False): + ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} + if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): + logger.info('Using Facebook cookie') + youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') + + ydl = youtube_dl.YoutubeDL(ydl_opts) + cdn_url = None + status = 'success' + + try: + info = ydl.extract_info(url, download=False) + except youtube_dl.utils.DownloadError: + # no video here + return False + + if 'is_live' in info and info['is_live']: + logger.warning("Live streaming media, not archiving now") + return ArchiveResult(status="Streaming media") + + if check_if_exists: + if 'entries' in info: + if len(info['entries']) > 1: + logger.warning( + 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + return False + + filename = ydl.prepare_filename(info['entries'][0]) + else: + filename = ydl.prepare_filename(info) + + key = self.get_key(filename) + + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = self.get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + # sometimes this results in a different filename, so do this again + info = ydl.extract_info(url, download=True) + + if 'entries' in info: + if len(info['entries']) > 1: + logger.warning( + 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + return False + else: + info = info['entries'][0] + + filename = ydl.prepare_filename(info) + + if not os.path.exists(filename): + filename = filename.split('.')[0] + '.mkv' + + if status != 'already archived': + key = self. get_key(filename) + cdn_url = self.get_cdn_url(key) + + with open(filename, 'rb') as f: + self.do_s3_upload(f, key) + + # get duration + duration = info['duration'] if 'duration' in info else None + + # get thumbnails + key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, + title=info['title'] if 'title' in info else None, + timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) diff --git a/auto_archive.py b/auto_archive.py index ef4f89c..c478463 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,14 +1,12 @@ -from dataclasses import dataclass -import gspread -from pathlib import Path -import datetime -import boto3 import os -from dotenv import load_dotenv +import datetime import argparse import math -import threading +import gspread +import boto3 from loguru import logger +from dotenv import load_dotenv + import archivers load_dotenv() @@ -156,6 +154,7 @@ def process_sheet(sheet): 'duration')) if 'duration' in headers else None + # order matters, first to succeed excludes remaining active_archivers = [ archivers.TelegramArchiver(s3_client), archivers.TiktokArchiver(s3_client), @@ -198,7 +197,7 @@ def process_sheet(sheet): def main(): parser = argparse.ArgumentParser( - description="Automatically use youtube-dl to download media from a Google Sheet") + description="Automatically archive social media videos from a Google Sheet") parser.add_argument("--sheet", action="store", dest="sheet") args = parser.parse_args() From 07b5d357b478b892313f8813e4fb77764fd811c9 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 22 Feb 2022 08:20:45 +0100 Subject: [PATCH 04/16] Fix bugs in WaybackArchiver, follow redirects sometimes --- archivers.py | 48 +++++++++++++++++++++++++++++++++++------------- auto_archive.py | 30 +++++++++++++++++++----------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/archivers.py b/archivers.py index d8a72f6..7c8df8c 100644 --- a/archivers.py +++ b/archivers.py @@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver): if 'entries' in info: if len(info['entries']) > 1: logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + 'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') + return False + elif len(info['entries']) == 0: + logger.warning( + 'YoutubeDLArchiver succeeded but did not find video') return False filename = ydl.prepare_filename(info['entries'][0]) @@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver): duration = info['duration'] if 'duration' in info else None # get thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) + try: + key_thumb, thumb_index = get_thumbnails( + filename, self.s3, duration=duration) + except: + key_thumb = '' + thumb_index = 'Could not generate thumbnails' + os.remove(filename) + timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime( + info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, - timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) + timestamp=timestamp) class WaybackArchiver(Archiver): @@ -286,6 +298,9 @@ class WaybackArchiver(Archiver): if r.status_code != 200: return ArchiveResult(status="Internet archive failed") + if 'job_id' not in r.json() and 'message' in r.json(): + return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") + job_id = r.json()['job_id'] status_r = requests.get( @@ -311,7 +326,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = 'https://web.archive.org/web/' + \ status_json['timestamp'] + '/' + status_json['original_url'] @@ -324,6 +339,9 @@ class WaybackArchiver(Archiver): title = parsed.find_all('title')[ 0].text + + if title == 'Wayback Machine': + title = 'Could not get title' except: title = "Could not get title" @@ -343,6 +361,7 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) key = 'tiktok_' + str(info.id) + '.mp4' + cdn_url = get_cdn_url(key) filename = 'tmp/' + key if check_if_exists: @@ -357,16 +376,19 @@ class TiktokArchiver(Archiver): except ClientError: pass - if status != 'already archived': - media = tiktok_downloader.snaptik(url).get_media() - if len(media) > 0: - media[0].download(filename) - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) + media = tiktok_downloader.snaptik(url).get_media() - cdn_url = get_cdn_url(key) + if len(media) <= 0: + if status == 'already archived': + return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) else: - status = 'could not download media' + return ArchiveResult(status='Could not download media') + + media[0].download(filename) + + if status != 'already archived': + with open(filename, 'rb') as f: + do_s3_upload(self.s3, f, key) try: key_thumb, thumb_index = get_thumbnails( diff --git a/auto_archive.py b/auto_archive.py index ef4f89c..fe2ccfd 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -10,6 +10,7 @@ import math import threading from loguru import logger import archivers +import requests load_dotenv() @@ -43,7 +44,7 @@ def index_to_col(index): return alphabet[index] -def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v): +def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v): update = [] if columns['status'] is not None: @@ -155,7 +156,6 @@ def process_sheet(sheet): columns['duration'] = index_to_col(headers.index( 'duration')) if 'duration' in headers else None - active_archivers = [ archivers.TelegramArchiver(s3_client), archivers.TiktokArchiver(s3_client), @@ -163,7 +163,6 @@ def process_sheet(sheet): archivers.WaybackArchiver(s3_client) ] - # loop through rows in worksheet for i in range(2, len(values)+1): v = values[i-1] @@ -174,26 +173,35 @@ def process_sheet(sheet): # check so we don't step on each others' toes if latest_val == '' or latest_val is None: - wks.update( - columns['status'] + str(i), 'Archive in progress') + wks.update(columns['status'] + str(i), + 'Archive in progress') for archiver in active_archivers: logger.debug(f"Trying {archiver} on row {i}") - result = archiver.download(v[url_index], check_if_exists=True) + + url = v[url_index] + # expand short URL links + if 'https://t.co/' in url: + r = requests.get(url) + url = r.url + + result = archiver.download(url, check_if_exists=True) if result: logger.info(f"{archiver} succeeded on row {i}") break if result: update_sheet(wks, i, result, columns, v) - + else: + wks.update(columns['status'] + + str(i), 'failed: no archiver') # except: - # if any unexpected errors occured, log these into the Google Sheet - # t, value, traceback = sys.exc_info() + # if any unexpected errors occured, log these into the Google Sheet + # t, value, traceback = sys.exc_info() - # update_sheet(wks, i, str( - # value), {}, columns, v) + # update_sheet(wks, i, str( + # value), {}, columns, v) def main(): From e4603a942305bcd9ad772d14e21af7cb896ba759 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 22 Feb 2022 16:03:35 +0100 Subject: [PATCH 05/16] refactoring storage and bringing changes from origin --- __init__.py | 1 + archivers/base_archiver.py | 55 +++++++++---------------- archivers/telegram_archiver.py | 27 ++++-------- archivers/tiktok_archiver.py | 39 +++++++----------- archivers/wayback_archiver.py | 27 ++++++------ archivers/youtubedl_archiver.py | 41 +++++++++--------- auto_archive.py | 73 ++++++++++++++++++++------------- storages/__init__.py | 3 ++ storages/base_storage.py | 19 +++++++++ storages/s3_storage.py | 49 ++++++++++++++++++++++ 10 files changed, 197 insertions(+), 137 deletions(-) create mode 100644 __init__.py create mode 100644 storages/__init__.py create mode 100644 storages/base_storage.py create mode 100644 storages/s3_storage.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..b85e02a --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +from storages import * \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 3f9f4ac..b13a77f 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,17 +1,10 @@ import os import ffmpeg -from dataclasses import dataclass import datetime -from loguru import logger +from dataclasses import dataclass +from abc import ABC, abstractmethod -# TODO There should be a better way of generating keys, that adds the following info: -# - name of sheet that it is being archived from -# (this means we might archive the same media twice on different sheets, but that's OK I think) -# - name of archiver/platform that the video comes from -# This should make it easier to maintain and clean the archive later - -# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be -# cleaned up? Difficult is we don't know the filename until the archivers start working. +from storages import Storage @dataclass @@ -25,33 +18,27 @@ class ArchiveResult: timestamp: datetime.datetime = None -class Archiver: +class Archiver(ABC): name = "default" - def __init__(self, s3_client): - self.s3 = s3_client + def __init__(self, storage: Storage): + self.storage = storage def __str__(self): return self.__class__.__name__ - def download(self, url, check_if_exists=False): - logger.error("method 'download' not implemented") - - def get_cdn_url(self, key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - - def do_s3_upload(self, f, key): - self.s3.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) + @abstractmethod + def download(self, url, check_if_exists=False): pass def get_key(self, filename): - print(f"key base implementation: {self.name}") - # TODO: refactor to be more manageable - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key + """ + returns a key in the format "[archiverName]_[filename]" includes extension + """ + tail = os.path.split(filename)[1] # returns filename.ext from full path + _id, extension = os.path.splitext(tail) # returns [filename, .ext] + if 'unknown_video' in _id: + _id = _id.replace('unknown_video', 'jpg') + return f'{self.name}_{_id}{extension}' def get_thumbnails(self, filename, duration=None): if not os.path.exists(filename.split('.')[0]): @@ -80,10 +67,9 @@ class Archiver: thumbnail_filename = filename.split('.')[0] + '/' + fname key = filename.split('/')[1].split('.')[0] + '/' + fname - cdn_url = self.get_cdn_url(key) + cdn_url = self.storage.get_cdn_url(key) - with open(thumbnail_filename, 'rb') as f: - self.do_s3_upload(f, key) + self.storage.upload(thumbnail_filename, key) cdn_urls.append(cdn_url) os.remove(thumbnail_filename) @@ -107,9 +93,8 @@ class Archiver: thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) + self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) - thumb_index_cdn_url = self.get_cdn_url(thumb_index) + thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) return (key_thumb, thumb_index_cdn_url) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index d9168e4..16c6ccf 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -1,15 +1,13 @@ import os import requests from bs4 import BeautifulSoup -from botocore.errorfactory import ClientError -from .base_archiver import Archiver, ArchiveResult -# TODO: get_cdn_url, get_thumbnails, do_s3_upload +from .base_archiver import Archiver, ArchiveResult class TelegramArchiver(Archiver): name = "telegram" - + def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle if 'http://t.me/' not in url and 'https://t.me/' not in url: @@ -35,19 +33,13 @@ class TelegramArchiver(Archiver): video_url = video.get('src') key = video_url.split('/')[-1].split('?')[0] + key = self.get_key(key) + filename = 'tmp/' + key - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = self.get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass + if check_if_exists and self.storage.exists(key): + status = 'already archived' + cdn_url = self.storage.get_cdn_url(key) v = requests.get(video_url, headers=headers) @@ -55,10 +47,9 @@ class TelegramArchiver(Archiver): f.write(v.content) if status != 'already archived': - cdn_url = self.get_cdn_url(key) + cdn_url = self.storage.get_cdn_url(key) - with open(filename, 'rb') as f: - self.do_s3_upload(f, key) + self.storage.upload(filename, key) # extract duration from HTML duration = s.find_all('time')[0].contents[0] diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 1e3bcaf..e61fec9 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -1,15 +1,13 @@ import os, traceback -from botocore.errorfactory import ClientError import tiktok_downloader from loguru import logger -from .base_archiver import Archiver, ArchiveResult -# TODO: get_cdn_url, do_s3_upload, get_thumbnails +from .base_archiver import Archiver, ArchiveResult class TiktokArchiver(Archiver): name = "tiktok" - + def download(self, url, check_if_exists=False): if 'tiktok.com' not in url: return False @@ -18,35 +16,28 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) - key = 'tiktok_' + str(info.id) + '.mp4' + key = self.get_key(f'{info.id}.mp4') + cdn_url = self.get_cdn_url(key) filename = 'tmp/' + key - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + if check_if_exists and self.storage.exists(key): + status = 'already archived' - # file exists - cdn_url = self.get_cdn_url(key) + media = tiktok_downloader.snaptik(url).get_media() - status = 'already archived' + if len(media) <= 0: + if status == 'already archived': + return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) + else: + return ArchiveResult(status='Could not download media') - except ClientError: - pass + media[0].download(filename) if status != 'already archived': - media = tiktok_downloader.snaptik(url).get_media() - if len(media) > 0: - media[0].download(filename) - with open(filename, 'rb') as f: - self.do_s3_upload(f, key) - - cdn_url = self.get_cdn_url(key) - else: - status = 'could not download media' + self.storage.upload(filename, key) try: - key_thumb, thumb_index = self.get_thumbnails( - filename, duration=info.duration) + key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration) except: key_thumb = '' thumb_index = 'error creating thumbnails' diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index a021324..53b356f 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -1,14 +1,15 @@ import time, requests, os from bs4 import BeautifulSoup +from storages import Storage from .base_archiver import Archiver, ArchiveResult class WaybackArchiver(Archiver): name = "wayback" - - def __init__(self, s3_client): - self.s3 = s3_client + + def __init__(self, storage: Storage): + super(WaybackArchiver, self).__init__(storage) self.seen_urls = {} def download(self, url, check_if_exists=False): @@ -26,10 +27,12 @@ class WaybackArchiver(Archiver): if r.status_code != 200: return ArchiveResult(status="Internet archive failed") + if 'job_id' not in r.json() and 'message' in r.json(): + return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") + job_id = r.json()['job_id'] - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers) retries = 0 @@ -51,7 +54,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = 'https://web.archive.org/web/' + \ status_json['timestamp'] + '/' + status_json['original_url'] @@ -59,15 +62,15 @@ class WaybackArchiver(Archiver): try: r = requests.get(archive_url) - parsed = BeautifulSoup( - r.content, 'html.parser') + parsed = BeautifulSoup(r.content, 'html.parser') - title = parsed.find_all('title')[ - 0].text + title = parsed.find_all('title')[0].text + + if title == 'Wayback Machine': + title = 'Could not get title' except: title = "Could not get title" - result = ArchiveResult( - status='Internet Archive fallback', cdn_url=archive_url, title=title) + result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title) self.seen_urls[url] = result return result diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 8249cfa..88f7970 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -3,12 +3,13 @@ import os import datetime import youtube_dl from loguru import logger -from botocore.errorfactory import ClientError + from .base_archiver import Archiver, ArchiveResult + class YoutubeDLArchiver(Archiver): name = "yotube_dl" - + def download(self, url, check_if_exists=False): ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): @@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver): if check_if_exists: if 'entries' in info: if len(info['entries']) > 1: + logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') + return False + elif len(info['entries']) == 0: logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + 'YoutubeDLArchiver succeeded but did not find video') return False filename = ydl.prepare_filename(info['entries'][0]) @@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver): key = self.get_key(filename) - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = self.get_cdn_url(key) - + if self.storage.exists(key): status = 'already archived' - - except ClientError: - pass + cdn_url = self.storage.get_cdn_url(key) # sometimes this results in a different filename, so do this again info = ydl.extract_info(url, download=True) + # TODO: add support for multiple videos if 'entries' in info: if len(info['entries']) > 1: logger.warning( @@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver): filename = filename.split('.')[0] + '.mkv' if status != 'already archived': - key = self. get_key(filename) - cdn_url = self.get_cdn_url(key) + key = self.get_key(filename) + cdn_url = self.storage.get_cdn_url(key) - with open(filename, 'rb') as f: - self.do_s3_upload(f, key) + self.storage.upload(filename, key) # get duration duration = info['duration'] if 'duration' in info else None # get thumbnails - key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + try: + key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + except: + key_thumb = '' + thumb_index = 'Could not generate thumbnails' + os.remove(filename) + timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, - timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) + title=info['title'] if 'title' in info else None, timestamp=timestamp) diff --git a/auto_archive.py b/auto_archive.py index c478463..36bbadb 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -2,12 +2,13 @@ import os import datetime import argparse import math +import requests import gspread -import boto3 from loguru import logger from dotenv import load_dotenv import archivers +from storages import S3Storage, S3Config load_dotenv() @@ -41,7 +42,7 @@ def index_to_col(index): return alphabet[index] -def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v): +def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v): update = [] if columns['status'] is not None: @@ -103,19 +104,24 @@ def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v): def process_sheet(sheet): gc = gspread.service_account(filename='service_account.json') sh = gc.open(sheet) - n_worksheets = len(sh.worksheets()) - s3_client = boto3.client('s3', - region_name=os.getenv('DO_SPACES_REGION'), - endpoint_url='https://{}.digitaloceanspaces.com'.format( - os.getenv('DO_SPACES_REGION')), - aws_access_key_id=os.getenv('DO_SPACES_KEY'), - aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) + s3_config = S3Config( + bucket=os.getenv('DO_BUCKET'), + region=os.getenv('DO_SPACES_REGION'), + key=os.getenv('DO_SPACES_KEY'), + secret=os.getenv('DO_SPACES_SECRET') + ) + + # s3_client = boto3.client('s3', + # region_name=os.getenv('DO_SPACES_REGION'), + # endpoint_url='https://{}.digitaloceanspaces.com'.format( + # os.getenv('DO_SPACES_REGION')), + # aws_access_key_id=os.getenv('DO_SPACES_KEY'), + # aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) # loop through worksheets to check - for ii in range(n_worksheets): - logger.info("Opening worksheet " + str(ii)) - wks = sh.get_worksheet(ii) + for ii, wks in enumerate(sh.worksheets()): + logger.info(f'Opening worksheet {ii}: "{wks.title}"') values = wks.get_all_values() headers = [v.lower() for v in values[0]] @@ -126,7 +132,7 @@ def process_sheet(sheet): 'source url')) if 'source url' in headers else None if columns['url'] is None: - logger.warning("No 'Media URL' column found, skipping") + logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}') continue url_index = col_to_index(columns['url']) @@ -153,6 +159,9 @@ def process_sheet(sheet): columns['duration'] = index_to_col(headers.index( 'duration')) if 'duration' in headers else None + # archives will be in a folder 'doc_name/worksheet_name' + s3_config.folder = f'{sheet}/{wks.title}/' + s3_client = S3Storage(s3_config) # order matters, first to succeed excludes remaining active_archivers = [ @@ -162,37 +171,43 @@ def process_sheet(sheet): archivers.WaybackArchiver(s3_client) ] - # loop through rows in worksheet - for i in range(2, len(values)+1): - v = values[i-1] + for i in range(2, len(values) + 1): + v = values[i - 1] + url = v[url_index] - if v[url_index] != "" and v[col_to_index(columns['status'])] == "": - latest_val = wks.acell( - columns['status'] + str(i)).value + if url != "" and v[col_to_index(columns['status'])] == "": + latest_val = wks.acell(columns['status'] + str(i)).value # check so we don't step on each others' toes if latest_val == '' or latest_val is None: - wks.update( - columns['status'] + str(i), 'Archive in progress') + wks.update(columns['status'] + str(i), 'Archive in progress') + + # expand short URL links + if 'https://t.co/' in url: + r = requests.get(url) + url = r.url for archiver in active_archivers: logger.debug(f"Trying {archiver} on row {i}") - result = archiver.download(v[url_index], check_if_exists=True) + + result = archiver.download(url, check_if_exists=True) + if result: - logger.info(f"{archiver} succeeded on row {i}") + logger.success(f"{archiver} succeeded on row {i}") break if result: update_sheet(wks, i, result, columns, v) + else: + wks.update(columns['status'] + str(i), 'failed: no archiver') + # except: + # if any unexpected errors occured, log these into the Google Sheet + # t, value, traceback = sys.exc_info() - # except: - # if any unexpected errors occured, log these into the Google Sheet - # t, value, traceback = sys.exc_info() - - # update_sheet(wks, i, str( - # value), {}, columns, v) + # update_sheet(wks, i, str( + # value), {}, columns, v) def main(): diff --git a/storages/__init__.py b/storages/__init__.py new file mode 100644 index 0000000..3054d36 --- /dev/null +++ b/storages/__init__.py @@ -0,0 +1,3 @@ +# we need to explicitly expose the available imports here +from .base_storage import * +from .s3_storage import * \ No newline at end of file diff --git a/storages/base_storage.py b/storages/base_storage.py new file mode 100644 index 0000000..050a8eb --- /dev/null +++ b/storages/base_storage.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod + + +class Storage(ABC): + @abstractmethod + def __init__(self, config): pass + + @abstractmethod + def get_cdn_url(self, path): pass + + @abstractmethod + def exists(self, path): pass + + @abstractmethod + def uploadf(self, file, key, **kwargs): pass + + def upload(self, filename: str, key: str, **kwargs): + with open(filename, 'rb') as f: + self.uploadf(f, key, **kwargs) diff --git a/storages/s3_storage.py b/storages/s3_storage.py new file mode 100644 index 0000000..188db7e --- /dev/null +++ b/storages/s3_storage.py @@ -0,0 +1,49 @@ +import boto3 +from botocore.errorfactory import ClientError +from .base_storage import Storage +from dataclasses import dataclass + + +@dataclass +class S3Config: + bucket: str + region: str + key: str + secret: str + folder: str = "" + + +class S3Storage(Storage): + + def __init__(self, config: S3Config): + self.bucket = config.bucket + self.region = config.region + self.folder = config.folder + + if len(self.folder) and self.folder[-1] != '/': + self.folder += '/' + + self.s3 = boto3.client( + 's3', + region_name=self.region, + endpoint_url=f'https://{self.region}.digitaloceanspaces.com', + aws_access_key_id=config.key, + aws_secret_access_key=config.secret + ) + + def _get_path(self, key): + return self.folder + key + + def get_cdn_url(self, key): + return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}' + + def exists(self, key): + try: + self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key)) + return True + except ClientError: + return False + + def uploadf(self, file, key, **kwargs): + extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'} + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) From 2d145802b54d27012b5afedca4bef60b5a3038b5 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 09:54:03 +0100 Subject: [PATCH 06/16] extracted worksheet operations --- archivers/tiktok_archiver.py | 2 +- auto_archive.py | 196 +++++++++-------------------------- gworksheet.py | 97 +++++++++++++++++ 3 files changed, 145 insertions(+), 150 deletions(-) create mode 100644 gworksheet.py diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index e61fec9..b54f956 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -17,7 +17,7 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') - cdn_url = self.get_cdn_url(key) + cdn_url = self.storage.get_cdn_url(key) filename = 'tmp/' + key if check_if_exists and self.storage.exists(key): diff --git a/auto_archive.py b/auto_archive.py index 36bbadb..d636cbd 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,7 +1,6 @@ import os import datetime import argparse -import math import requests import gspread from loguru import logger @@ -9,96 +8,34 @@ from dotenv import load_dotenv import archivers from storages import S3Storage, S3Config +from gworksheet import GWorksheet load_dotenv() -def col_to_index(col): - col = list(col) - ndigits = len(col) - alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ' - v = 0 - i = ndigits - 1 - - for digit in col: - index = alphabet.find(digit) - v += (26 ** i) * index - i -= 1 - - return v - 1 - - -def index_to_col(index): - alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - - if index > 25: - t = index - dig = 0 - while t > 25: - t = math.floor(t / 26) - dig += 1 - return alphabet[t - 1] + index_to_col(index - t * int(math.pow(26, dig))) - else: - return alphabet[index] - - -def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v): +def update_sheet(gw, row, result: archivers.ArchiveResult): update = [] - if columns['status'] is not None: - update += [{ - 'range': columns['status'] + str(row), - 'values': [[result.status]] - }] + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + if val and gw.col_exists(col) and gw.cell(row, col) == '': + update.append((row, col, final_value)) - if result.cdn_url and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '': - update += [{ - 'range': columns['archive'] + str(row), - 'values': [[result.cdn_url]] - }] + update.append((row, 'status', result.status)) - if columns['date'] is not None and v[col_to_index(columns['date'])] == '': - update += [{ - 'range': columns['date'] + str(row), - 'values': [[datetime.datetime.now().isoformat()]] - }] + batch_if_valid('archive', result.cdn_url) + batch_if_valid('archive', True, datetime.datetime.now().isoformat()) + batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') + batch_if_valid('thumbnail_index', result.thumbnail_index) + batch_if_valid('title', result.title) + batch_if_valid('duration', result.duration, str(result.duration)) - if result.thumbnail and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '': - update += [{ - 'range': columns['thumbnail'] + str(row), - 'values': [['=IMAGE("' + result.thumbnail + '")']] - }] + if result.timestamp and type(result.timestamp) != str: + result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat() + batch_if_valid('timestamp', result.timestamp) - if result.thumbnail_index and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '': - update += [{ - 'range': columns['thumbnail_index'] + str(row), - 'values': [[result.thumbnail_index]] - }] - - if result.timestamp and columns['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '': - update += [{ - 'range': columns['timestamp'] + str(row), - 'values': [[result.timestamp]] if type(result.timestamp) == str else [[datetime.datetime.fromtimestamp(result.timestamp).isoformat()]] - }] - - if result.title and columns['title'] is not None and v[col_to_index(columns['title'])] == '': - update += [{ - 'range': columns['title'] + str(row), - 'values': [[result.title]] - }] - - if result.duration and columns['duration'] is not None and v[col_to_index(columns['duration'])] == '': - update += [{ - 'range': columns['duration'] + str(row), - 'values': [[str(result.duration)]] - }] - - wks.batch_update(update, value_input_option='USER_ENTERED') - - -# def record_stream(url, s3_client, wks, i, columns, v): -# video_data, status = download_vid(url, s3_client) -# update_sheet(wks, i, status, video_data, columns, v) + gw.update_batch(update) + def process_sheet(sheet): @@ -112,53 +49,19 @@ def process_sheet(sheet): secret=os.getenv('DO_SPACES_SECRET') ) - # s3_client = boto3.client('s3', - # region_name=os.getenv('DO_SPACES_REGION'), - # endpoint_url='https://{}.digitaloceanspaces.com'.format( - # os.getenv('DO_SPACES_REGION')), - # aws_access_key_id=os.getenv('DO_SPACES_KEY'), - # aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) - # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): logger.info(f'Opening worksheet {ii}: "{wks.title}"') - values = wks.get_all_values() + gw = GWorksheet(wks) - headers = [v.lower() for v in values[0]] - columns = {} - - columns['url'] = index_to_col(headers.index( - 'media url')) if 'media url' in headers else index_to_col(headers.index( - 'source url')) if 'source url' in headers else None - - if columns['url'] is None: + if not gw.col_exists("url"): logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}') continue - url_index = col_to_index(columns['url']) - - columns['archive'] = index_to_col(headers.index( - 'archive location')) if 'archive location' in headers else None - columns['date'] = index_to_col(headers.index( - 'archive date')) if 'archive date' in headers else None - columns['status'] = index_to_col(headers.index( - 'archive status')) if 'archive status' in headers else None - - if columns['status'] is None: + if not gw.col_exists("status"): logger.warning("No 'Archive status' column found, skipping") continue - columns['thumbnail'] = index_to_col(headers.index( - 'thumbnail')) if 'thumbnail' in headers else None - columns['thumbnail_index'] = index_to_col(headers.index( - 'thumbnail index')) if 'thumbnail index' in headers else None - columns['timestamp'] = index_to_col(headers.index( - 'upload timestamp')) if 'upload timestamp' in headers else None - columns['title'] = index_to_col(headers.index( - 'upload title')) if 'upload title' in headers else None - columns['duration'] = index_to_col(headers.index( - 'duration')) if 'duration' in headers else None - # archives will be in a folder 'doc_name/worksheet_name' s3_config.folder = f'{sheet}/{wks.title}/' s3_client = S3Storage(s3_config) @@ -172,47 +75,42 @@ def process_sheet(sheet): ] # loop through rows in worksheet - for i in range(2, len(values) + 1): - v = values[i - 1] - url = v[url_index] + for i in range(2, gw.count_rows() + 1): + row = gw.get_row(i) + url = gw.cell(row, 'url') + status = gw.cell(row, 'status') + if url != '' and status in ['', None]: + gw.update(i, 'status', 'Archive in progress') - if url != "" and v[col_to_index(columns['status'])] == "": - latest_val = wks.acell(columns['status'] + str(i)).value + # expand short URL links + if 'https://t.co/' in url: + r = requests.get(url) + url = r.url - # check so we don't step on each others' toes - if latest_val == '' or latest_val is None: - wks.update(columns['status'] + str(i), 'Archive in progress') - - # expand short URL links - if 'https://t.co/' in url: - r = requests.get(url) - url = r.url - - for archiver in active_archivers: - logger.debug(f"Trying {archiver} on row {i}") - - result = archiver.download(url, check_if_exists=True) - - if result: - logger.success(f"{archiver} succeeded on row {i}") - break + for archiver in active_archivers: + logger.debug(f'Trying {archiver} on row {i}') + result = archiver.download(url, check_if_exists=True) if result: - update_sheet(wks, i, result, columns, v) - else: - wks.update(columns['status'] + str(i), 'failed: no archiver') + logger.success(f'{archiver} succeeded on row {i}') + break - # except: - # if any unexpected errors occured, log these into the Google Sheet - # t, value, traceback = sys.exc_info() + if result: + update_sheet(gw, i, result) + else: + gw.update(i, 'status', 'failed: no archiver') - # update_sheet(wks, i, str( - # value), {}, columns, v) + # # except: + # # if any unexpected errors occured, log these into the Google Sheet + # # t, value, traceback = sys.exc_info() + + # # update_sheet(wks, i, str( + # # value), {}, columns, v) def main(): parser = argparse.ArgumentParser( - description="Automatically archive social media videos from a Google Sheet") + description="Automatically archive social media videos from a Google Sheets document") parser.add_argument("--sheet", action="store", dest="sheet") args = parser.parse_args() diff --git a/gworksheet.py b/gworksheet.py new file mode 100644 index 0000000..721bb01 --- /dev/null +++ b/gworksheet.py @@ -0,0 +1,97 @@ +from gspread import utils + + +class GWorksheet: + COLUMN_NAMES = { + 'url': 'media url', + 'archive': 'archive location', + 'date': 'archive date', + 'status': 'archive status', + 'thumbnail': 'thumbnail', + 'thumbnail_index': 'thumbnail index', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'duration': 'duration' + } + + def __init__(self, worksheet, columns=COLUMN_NAMES): + self.wks = worksheet + self.headers = [v.lower() for v in self.wks.row_values(1)] + self.columns = columns + + def worksheet(self): return self.wks + + def _check_col_exists(self, col: str): + if col not in self.columns: + raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}') + + def col_exists(self, col: str): + self._check_col_exists(col) + return self.columns[col] in self.headers + + def col_index(self, col: str): + self._check_col_exists(col) + return self.headers.index(self.columns[col]) + + def count_rows(self): + return len(self.wks.get_values()) + + def get_row(self, row: int): + # row is 1-based + return self.wks.row_values(row) + + def cell(self, row, col: str): + # row can be index (1-based) or list of values + if type(row) == int: + row = self.get_row(row) + + col_index = self.col_index(col) + if col_index >= len(row): + return '' + return row[col_index] + + def update(self, row: int, col: str, val): + # row is 1-based + col_index = self.col_index(col) + 1 + self.wks.update_cell(row, col_index, val) + + def update_batch(self, updates): + updates = [ + { + 'range': self.to_a1(row, self.col_index(col) + 1), + 'values': [[val]] + } + for row, col, val in updates + ] + self.wks.batch_update(updates, value_input_option='USER_ENTERED') + + def to_a1(self, row: int, col: int): + # row, col are 1-based + return utils.rowcol_to_a1(row, col) + + # def index_to_col(self, index): + # alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + + # if index > 25: + # t = index + # dig = 0 + # while t > 25: + # t = math.floor(t / 26) + # dig += 1 + # return alphabet[t - 1] + self.index_to_col(index - t * int(math.pow(26, dig))) + # else: + # return alphabet[index] + + # def col_to_index(self, col): + # col = list(col) + # ndigits = len(col) + # alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ' + # v = 0 + # i = ndigits - 1 + + # for digit in col: + # index = alphabet.find(digit) + # v += (26 ** i) * index + # i -= 1 + + # return v - 1 From 374852e740f38ba9609862b1c2f315e9ac9b9b6c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 09:57:04 +0100 Subject: [PATCH 07/16] cleanup --- auto_archive.py | 13 ++++++------- gworksheet.py | 27 --------------------------- 2 files changed, 6 insertions(+), 34 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index d636cbd..cbe0744 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -35,7 +35,6 @@ def update_sheet(gw, row, result: archivers.ArchiveResult): batch_if_valid('timestamp', result.timestamp) gw.update_batch(update) - def process_sheet(sheet): @@ -54,11 +53,11 @@ def process_sheet(sheet): logger.info(f'Opening worksheet {ii}: "{wks.title}"') gw = GWorksheet(wks) - if not gw.col_exists("url"): + if not gw.col_exists('url'): logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}') continue - if not gw.col_exists("status"): + if not gw.col_exists('status'): logger.warning("No 'Archive status' column found, skipping") continue @@ -110,14 +109,14 @@ def process_sheet(sheet): def main(): parser = argparse.ArgumentParser( - description="Automatically archive social media videos from a Google Sheets document") - parser.add_argument("--sheet", action="store", dest="sheet") + description='Automatically archive social media videos from a Google Sheets document') + parser.add_argument('--sheet', action='store', dest='sheet') args = parser.parse_args() - logger.info("Opening document " + args.sheet) + logger.info(f'Opening document {args.sheet}') process_sheet(args.sheet) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/gworksheet.py b/gworksheet.py index 721bb01..496ddcc 100644 --- a/gworksheet.py +++ b/gworksheet.py @@ -68,30 +68,3 @@ class GWorksheet: def to_a1(self, row: int, col: int): # row, col are 1-based return utils.rowcol_to_a1(row, col) - - # def index_to_col(self, index): - # alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - - # if index > 25: - # t = index - # dig = 0 - # while t > 25: - # t = math.floor(t / 26) - # dig += 1 - # return alphabet[t - 1] + self.index_to_col(index - t * int(math.pow(26, dig))) - # else: - # return alphabet[index] - - # def col_to_index(self, col): - # col = list(col) - # ndigits = len(col) - # alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ' - # v = 0 - # i = ndigits - 1 - - # for digit in col: - # index = alphabet.find(digit) - # v += (26 ** i) * index - # i -= 1 - - # return v - 1 From 644aa0811c55219027e9c5eb98ef18e6c9e82057 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 09:57:44 +0100 Subject: [PATCH 08/16] todo --- auto_archive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_archive.py b/auto_archive.py index cbe0744..7e624d0 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -88,6 +88,7 @@ def process_sheet(sheet): for archiver in active_archivers: logger.debug(f'Trying {archiver} on row {i}') + # TODO: add support for multiple videos/images result = archiver.download(url, check_if_exists=True) if result: From 9550cd509e684159823cf9ed4a7ba8733c912daa Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 13:57:11 +0100 Subject: [PATCH 09/16] making code more resilient to exceptions --- archivers/telegram_archiver.py | 4 +-- auto_archive.py | 64 +++++++++++++++++++--------------- gworksheet.py | 37 +++++++++++--------- 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 16c6ccf..5593acd 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -54,8 +54,8 @@ class TelegramArchiver(Archiver): # extract duration from HTML duration = s.find_all('time')[0].contents[0] if ':' in duration: - duration = float(duration.split( - ':')[0]) * 60 + float(duration.split(':')[1]) + duration = float(duration.split(':')[0]) * 60 + + float(duration.split(':')[1]) else: duration = float(duration) diff --git a/auto_archive.py b/auto_archive.py index 7e624d0..cb70c58 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -14,17 +14,18 @@ load_dotenv() def update_sheet(gw, row, result: archivers.ArchiveResult): - update = [] + cell_updates = [] + row_values = gw.get_row(row) def batch_if_valid(col, val, final_value=None): final_value = final_value or val - if val and gw.col_exists(col) and gw.cell(row, col) == '': - update.append((row, col, final_value)) + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + cell_updates.append((row, col, final_value)) - update.append((row, 'status', result.status)) + cell_updates.append((row, 'status', result.status)) batch_if_valid('archive', result.cdn_url) - batch_if_valid('archive', True, datetime.datetime.now().isoformat()) + batch_if_valid('date', True, datetime.datetime.now().isoformat()) batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') batch_if_valid('thumbnail_index', result.thumbnail_index) batch_if_valid('title', result.title) @@ -34,7 +35,18 @@ def update_sheet(gw, row, result: archivers.ArchiveResult): result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat() batch_if_valid('timestamp', result.timestamp) - gw.update_batch(update) + gw.batch_set_cell(cell_updates) + + +def expand_url(url): + # expand short URL links + if 'https://t.co/' in url: + try: + r = requests.get(url) + url = r.url + except: + logger.error(f'Failed to expand url {url}') + return url def process_sheet(sheet): @@ -74,38 +86,34 @@ def process_sheet(sheet): ] # loop through rows in worksheet - for i in range(2, gw.count_rows() + 1): - row = gw.get_row(i) - url = gw.cell(row, 'url') - status = gw.cell(row, 'status') + for row in range(2, gw.count_rows() + 1): + url = gw.get_cell(row, 'url') + status = gw.get_cell(row, 'status') if url != '' and status in ['', None]: - gw.update(i, 'status', 'Archive in progress') + gw.set_cell(row, 'status', 'Archive in progress') - # expand short URL links - if 'https://t.co/' in url: - r = requests.get(url) - url = r.url + url = expand_url(url) for archiver in active_archivers: - logger.debug(f'Trying {archiver} on row {i}') + logger.debug(f'Trying {archiver} on row {row}') + # TODO: add support for multiple videos/images - result = archiver.download(url, check_if_exists=True) + try: + result = archiver.download(url, check_if_exists=True) + except Exception as e: + result = False + logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}') if result: - logger.success(f'{archiver} succeeded on row {i}') - break + if result.status in ['success', 'already archived']: + logger.success(f'{archiver} succeeded on row {row}') + break + logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') if result: - update_sheet(gw, i, result) + update_sheet(gw, row, result) else: - gw.update(i, 'status', 'failed: no archiver') - - # # except: - # # if any unexpected errors occured, log these into the Google Sheet - # # t, value, traceback = sys.exc_info() - - # # update_sheet(wks, i, str( - # # value), {}, columns, v) + gw.set_cell(row, 'status', 'failed: no archiver') def main(): diff --git a/gworksheet.py b/gworksheet.py index 496ddcc..88de9a4 100644 --- a/gworksheet.py +++ b/gworksheet.py @@ -19,20 +19,18 @@ class GWorksheet: self.headers = [v.lower() for v in self.wks.row_values(1)] self.columns = columns - def worksheet(self): return self.wks - def _check_col_exists(self, col: str): if col not in self.columns: raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}') + def _col_index(self, col: str): + self._check_col_exists(col) + return self.headers.index(self.columns[col]) + def col_exists(self, col: str): self._check_col_exists(col) return self.columns[col] in self.headers - def col_index(self, col: str): - self._check_col_exists(col) - return self.headers.index(self.columns[col]) - def count_rows(self): return len(self.wks.get_values()) @@ -40,30 +38,37 @@ class GWorksheet: # row is 1-based return self.wks.row_values(row) - def cell(self, row, col: str): - # row can be index (1-based) or list of values + def get_cell(self, row, col: str): + """ + returns the cell value from (row, col), + where row can be an index (1-based) OR list of values + as received from self.get_row(row) + """ if type(row) == int: row = self.get_row(row) - col_index = self.col_index(col) + col_index = self._col_index(col) if col_index >= len(row): return '' return row[col_index] - def update(self, row: int, col: str, val): + def set_cell(self, row: int, col: str, val): # row is 1-based - col_index = self.col_index(col) + 1 + col_index = self._col_index(col) + 1 self.wks.update_cell(row, col_index, val) - def update_batch(self, updates): - updates = [ + def batch_set_cell(self, cell_updates): + """ + receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method + """ + cell_updates = [ { - 'range': self.to_a1(row, self.col_index(col) + 1), + 'range': self.to_a1(row, self._col_index(col) + 1), 'values': [[val]] } - for row, col, val in updates + for row, col, val in cell_updates ] - self.wks.batch_update(updates, value_input_option='USER_ENTERED') + self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED') def to_a1(self, row: int, col: int): # row, col are 1-based From 9a264a7dfeffc0077b7451f578fd46407efdedbd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:07:58 +0100 Subject: [PATCH 10/16] cleanup and docs --- README.md | 20 ++++++++++++++++++++ archivers/base_archiver.py | 26 ++++++++++++++++---------- archivers/telegram_archiver.py | 10 +++++----- archivers/tiktok_archiver.py | 5 +++-- archivers/youtubedl_archiver.py | 13 +++++++------ gworksheet.py | 8 ++++---- 6 files changed, 55 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index cec6e9a..3d7f751 100644 --- a/README.md +++ b/README.md @@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) +# Code structure +Code is split into functional concepts: +1. [Archivers](archivers/) - receive a URL that they try to archive +2. [Storages](storages/) - they deal with where the archived files go +3. utilities + 1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet + +### Current Archivers +```mermaid +graph TD + A(Archiver) -->|parent of| B(TelegramArchiver) + A -->|parent of| C(TikTokArchiver) + A -->|parent of| D(YoutubeDLArchiver) + A -->|parent of| E(WaybackArchiver) +``` +### Current Storages +```mermaid +graph TD + A(BaseStorage) -->|parent of| B(S3Storage) +``` diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index b13a77f..6257aba 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -3,6 +3,7 @@ import ffmpeg import datetime from dataclasses import dataclass from abc import ABC, abstractmethod +from urllib.parse import urlparse from storages import Storage @@ -30,6 +31,9 @@ class Archiver(ABC): @abstractmethod def download(self, url, check_if_exists=False): pass + def get_netloc(self, url): + return urlparse(url).netloc + def get_key(self, filename): """ returns a key in the format "[archiverName]_[filename]" includes extension @@ -40,9 +44,12 @@ class Archiver(ABC): _id = _id.replace('unknown_video', 'jpg') return f'{self.name}_{_id}{extension}' - def get_thumbnails(self, filename, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) + def get_thumbnails(self, filename, key, duration=None): + thumbnails_folder = filename.split('.')[0] + '/' + key_folder = key.split('.')[0] + '/' + + if not os.path.exists(thumbnails_folder): + os.mkdir(thumbnails_folder) fps = 0.5 if duration is not None: @@ -57,15 +64,14 @@ class Archiver(ABC): stream = ffmpeg.input(filename) stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() + stream.output(thumbnails_folder + 'out%d.jpg').run() - thumbnails = os.listdir(filename.split('.')[0] + '/') + thumbnails = os.listdir(thumbnails_folder) cdn_urls = [] - for fname in thumbnails: if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname + thumbnail_filename = thumbnails_folder + fname + key = key_folder + fname cdn_url = self.storage.get_cdn_url(key) @@ -86,12 +92,12 @@ class Archiver(ABC): index_page += f'' index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' + index_fname = thumbnails_folder + 'index.html' with open(index_fname, 'w') as f: f.write(index_page) - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' + thumb_index = key_folder + 'index.html' self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5593acd..5a9b013 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -10,7 +10,7 @@ class TelegramArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: + if 't.me' != self.get_netloc(url): return False headers = { @@ -20,7 +20,7 @@ class TelegramArchiver(Archiver): original_url = url - # TODO: check if we can do this more resilient to user-input + # TODO: check if we can do this more resilient to variable URLs if url[-8:] != "?embed=1": url += "?embed=1" @@ -32,8 +32,8 @@ class TelegramArchiver(Archiver): return False # could not find video video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - key = self.get_key(key) + video_id = video_url.split('/')[-1].split('?')[0] + key = self.get_key(video_id) filename = 'tmp/' + key @@ -60,7 +60,7 @@ class TelegramArchiver(Archiver): duration = float(duration) # process thumbnails - key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) os.remove(filename) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index b54f956..62aa415 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -37,8 +37,9 @@ class TiktokArchiver(Archiver): self.storage.upload(filename, key) try: - key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration) - except: + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration) + except Exception as e: + logger.error(e) key_thumb = '' thumb_index = 'error creating thumbnails' diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 88f7970..ec11061 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult class YoutubeDLArchiver(Archiver): name = "yotube_dl" + ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): + netloc = self.get_netloc(url) + if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'): logger.info('Using Facebook cookie') youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - ydl = youtube_dl.YoutubeDL(ydl_opts) + ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts) cdn_url = None status = 'success' @@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver): # no video here return False - if 'is_live' in info and info['is_live']: + if info.get('is_live', False): logger.warning("Live streaming media, not archiving now") return ArchiveResult(status="Streaming media") @@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver): self.storage.upload(filename, key) # get duration - duration = info['duration'] if 'duration' in info else None + duration = info.get('duration') # get thumbnails try: - key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) except: key_thumb = '' thumb_index = 'Could not generate thumbnails' diff --git a/gworksheet.py b/gworksheet.py index 88de9a4..4349e2a 100644 --- a/gworksheet.py +++ b/gworksheet.py @@ -63,13 +63,13 @@ class GWorksheet: """ cell_updates = [ { - 'range': self.to_a1(row, self._col_index(col) + 1), + 'range': self.to_a1(row, col), 'values': [[val]] } for row, col, val in cell_updates ] self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED') - def to_a1(self, row: int, col: int): - # row, col are 1-based - return utils.rowcol_to_a1(row, col) + def to_a1(self, row: int, col: str): + # row is 1-based + return utils.rowcol_to_a1(row, self._col_index(col) + 1) From 2601313249fd46581150b05692f908df5c709c07 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:13:09 +0100 Subject: [PATCH 11/16] removed archivers.py --- archivers.py | 412 --------------------------------------------------- 1 file changed, 412 deletions(-) delete mode 100644 archivers.py diff --git a/archivers.py b/archivers.py deleted file mode 100644 index 7c8df8c..0000000 --- a/archivers.py +++ /dev/null @@ -1,412 +0,0 @@ -from dataclasses import dataclass -import youtube_dl -from bs4 import BeautifulSoup -import requests -import tiktok_downloader -from loguru import logger -import os -import datetime -import ffmpeg -from botocore.errorfactory import ClientError -import time -import traceback - -# TODO There should be a better way of generating keys, that adds the following info: -# - name of sheet that it is being archived from -# (this means we might archive the same media twice on different sheets, but that's OK I think) -# - name of archiver/platform that the video comes from -# This should make it easier to maintain and clean the archive later - -# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be -# cleaned up? Difficult is we don't know the filename until the archivers start working. - - -def get_cdn_url(key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - - -def do_s3_upload(s3_client, f, key): - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - - -def get_key(filename): - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key - - -def get_thumbnails(filename, s3_client, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() - - thumbnails = os.listdir(filename.split('.')[0] + '/') - cdn_urls = [] - - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname - - cdn_url = get_cdn_url(key) - - with open(thumbnail_filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) - - if len(cdn_urls) == 0: - return ('None', 'None') - - key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - - s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) - - thumb_index_cdn_url = get_cdn_url(thumb_index) - - return (key_thumb, thumb_index_cdn_url) - - -@dataclass -class ArchiveResult: - status: str - cdn_url: str = None - thumbnail: str = None - thumbnail_index: str = None - duration: float = None - title: str = None - timestamp: datetime.datetime = None - - -class Archiver: - def __init__(self, s3_client): - self.s3 = s3_client - - def download(self, url): - pass - - -class TelegramArchiver(Archiver): - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} - status = "success" - - original_url = url - - if url[-8:] != "?embed=1": - url += "?embed=1" - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - return False # could not find video - - video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # extract duration from HTML - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0])*60 + float(duration.split(':')[1]) - else: - duration = float(duration) - - # process thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) - - -class YoutubeDLArchiver(Archiver): - def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): - logger.info('Using Facebook cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - - ydl = youtube_dl.YoutubeDL(ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except youtube_dl.utils.DownloadError: - # no video here - return False - - if 'is_live' in info and info['is_live']: - logger.warning("Live streaming media, not archiving now") - return ArchiveResult(status="Streaming media") - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') - return False - elif len(info['entries']) == 0: - logger.warning( - 'YoutubeDLArchiver succeeded but did not find video') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = get_key(filename) - - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = get_key(filename) - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # get duration - duration = info['duration'] if 'duration' in info else None - - # get thumbnails - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - except: - key_thumb = '' - thumb_index = 'Could not generate thumbnails' - - os.remove(filename) - - timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime( - info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, - timestamp=timestamp) - - -class WaybackArchiver(Archiver): - def __init__(self, s3_client): - self.s3 = s3_client - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if check_if_exists and url in self.seen_urls: - return self.seen_urls[url] - - ia_headers = { - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - if 'job_id' not in r.json() and 'message' in r.json(): - return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") - - job_id = r.json()['job_id'] - - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - - retries = 0 - - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - except: - time.sleep(1) - - retries += 1 - - if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - status_json = status_r.json() - - if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) - - archive_url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] - - try: - r = requests.get(archive_url) - - parsed = BeautifulSoup( - r.content, 'html.parser') - - title = parsed.find_all('title')[ - 0].text - - if title == 'Wayback Machine': - title = 'Could not get title' - except: - title = "Could not get title" - - result = ArchiveResult( - status='Internet Archive fallback', cdn_url=archive_url, title=title) - self.seen_urls[url] = result - return result - - -class TiktokArchiver(Archiver): - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = 'tiktok_' + str(info.id) + '.mp4' - cdn_url = get_cdn_url(key) - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - media = tiktok_downloader.snaptik(url).get_media() - - if len(media) <= 0: - if status == 'already archived': - return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) - else: - return ArchiveResult(status='Could not download media') - - media[0].download(filename) - - if status != 'already archived': - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=info.duration) - except: - key_thumb = '' - thumb_index = 'error creating thumbnails' - - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) - - except tiktok_downloader.Except.InvalidUrl: - status = 'Invalid URL' - return ArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - return ArchiveResult(status=status) From 1d62009c4f6043c53df39a88ae56df69752a012a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:24:59 +0100 Subject: [PATCH 12/16] creates utils module and moves gworkseet there --- README.md | 4 ++-- auto_archive.py | 2 +- utils/__init__.py | 2 ++ gworksheet.py => utils/gworksheet.py | 0 4 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 utils/__init__.py rename gworksheet.py => utils/gworksheet.py (100%) diff --git a/README.md b/README.md index 3d7f751..7910e30 100644 --- a/README.md +++ b/README.md @@ -72,8 +72,8 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil Code is split into functional concepts: 1. [Archivers](archivers/) - receive a URL that they try to archive 2. [Storages](storages/) - they deal with where the archived files go -3. utilities - 1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet +3. [Utilities](utils/) + 1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet ### Current Archivers ```mermaid diff --git a/auto_archive.py b/auto_archive.py index cb70c58..ba05310 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv import archivers from storages import S3Storage, S3Config -from gworksheet import GWorksheet +from utils import GWorksheet load_dotenv() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..482e144 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,2 @@ +# we need to explicitly expose the available imports here +from .gworksheet import GWorksheet \ No newline at end of file diff --git a/gworksheet.py b/utils/gworksheet.py similarity index 100% rename from gworksheet.py rename to utils/gworksheet.py From 3cafc444fc964ddc2ffcb34b94ac6ed67ae04a94 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:32:38 +0100 Subject: [PATCH 13/16] creates tmp folder if not exists --- __init__.py | 1 - archivers/base_archiver.py | 4 ++-- auto_archive.py | 3 ++- utils/__init__.py | 3 ++- utils/misc.py | 5 +++++ 5 files changed, 11 insertions(+), 5 deletions(-) delete mode 100644 __init__.py create mode 100644 utils/misc.py diff --git a/__init__.py b/__init__.py deleted file mode 100644 index b85e02a..0000000 --- a/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from storages import * \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 6257aba..dc47273 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from urllib.parse import urlparse from storages import Storage +from utils import mkdir_if_not_exists @dataclass @@ -48,8 +49,7 @@ class Archiver(ABC): thumbnails_folder = filename.split('.')[0] + '/' key_folder = key.split('.')[0] + '/' - if not os.path.exists(thumbnails_folder): - os.mkdir(thumbnails_folder) + mkdir_if_not_exists(thumbnails_folder) fps = 0.5 if duration is not None: diff --git a/auto_archive.py b/auto_archive.py index ba05310..472efd2 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv import archivers from storages import S3Storage, S3Config -from utils import GWorksheet +from utils import GWorksheet, mkdir_if_not_exists load_dotenv() @@ -124,6 +124,7 @@ def main(): logger.info(f'Opening document {args.sheet}') + mkdir_if_not_exists('tmp') process_sheet(args.sheet) diff --git a/utils/__init__.py b/utils/__init__.py index 482e144..9b58126 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,2 +1,3 @@ # we need to explicitly expose the available imports here -from .gworksheet import GWorksheet \ No newline at end of file +from .gworksheet import GWorksheet +from .misc import * \ No newline at end of file diff --git a/utils/misc.py b/utils/misc.py new file mode 100644 index 0000000..e8ef66d --- /dev/null +++ b/utils/misc.py @@ -0,0 +1,5 @@ +import os + +def mkdir_if_not_exists(folder): + if not os.path.exists(folder): + os.mkdir(folder) \ No newline at end of file From 214d52d36f21e1d72851f3f9dbba6b77f1b3f79f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 16:43:42 +0100 Subject: [PATCH 14/16] improved tmp folder management --- archivers/base_archiver.py | 3 ++- auto_archive.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index dc47273..12cca80 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,6 +1,7 @@ import os import ffmpeg import datetime +import shutil from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -78,7 +79,6 @@ class Archiver(ABC): self.storage.upload(thumbnail_filename, key) cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) if len(cdn_urls) == 0: return ('None', 'None') @@ -100,6 +100,7 @@ class Archiver(ABC): thumb_index = key_folder + 'index.html' self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) + shutil.rmtree(thumbnails_folder) thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) diff --git a/auto_archive.py b/auto_archive.py index 472efd2..ce82ee1 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -2,6 +2,7 @@ import os import datetime import argparse import requests +import shutil import gspread from loguru import logger from dotenv import load_dotenv @@ -126,7 +127,7 @@ def main(): mkdir_if_not_exists('tmp') process_sheet(args.sheet) - + shutil.rmtree('tmp') if __name__ == '__main__': main() From 4bbbdcc7fd4902d28efeba063c587d8b32fb6e99 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 18:30:06 +0100 Subject: [PATCH 15/16] minor update --- auto_archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_archive.py b/auto_archive.py index ce82ee1..211d3d7 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -71,7 +71,7 @@ def process_sheet(sheet): continue if not gw.col_exists('status'): - logger.warning("No 'Archive status' column found, skipping") + logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}') continue # archives will be in a folder 'doc_name/worksheet_name' From 8bce84082a94f8b5793203ec9a4c1a03c821a881 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 23 Feb 2022 18:32:40 +0100 Subject: [PATCH 16/16] minor updates --- archivers/telegram_archiver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5a9b013..d7b8924 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -36,10 +36,10 @@ class TelegramArchiver(Archiver): key = self.get_key(video_id) filename = 'tmp/' + key + cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): status = 'already archived' - cdn_url = self.storage.get_cdn_url(key) v = requests.get(video_url, headers=headers) @@ -47,8 +47,6 @@ class TelegramArchiver(Archiver): f.write(v.content) if status != 'already archived': - cdn_url = self.storage.get_cdn_url(key) - self.storage.upload(filename, key) # extract duration from HTML