Merge pull request #13 from bellingcat/refactor-archivers

WIP: Refactor archivers
This commit is contained in:
Logan Williams
2022-02-25 08:05:22 +01:00
committed by GitHub
19 changed files with 890 additions and 536 deletions

4
.gitignore vendored
View File

@@ -1,6 +1,8 @@
tmp/
.env
.env*
.DS_Store
expmt/
service_account.json
__pycache__/
._*
anu.html

View File

@@ -9,10 +9,13 @@ boto3 = "*"
python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
ffmpeg-python = "*"
beautifulsoup4 = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"
ffmpeg-python = "*"
[dev-packages]
[requires]
python_version = "3.8"
python_version = "3.9"

271
Pipfile.lock generated
View File

@@ -1,11 +1,11 @@
{
"_meta": {
"hash": {
"sha256": "2aa6e5f9d7cda1a459444bf812fb2f7a4acfe547e7c65a975ab41530f9213da5"
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.8"
"python_version": "3.9"
},
"sources": [
{
@@ -26,49 +26,80 @@
},
"beautifulsoup4": {
"hashes": [
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
"sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"
],
"index": "pypi",
"version": "==4.9.3"
"version": "==4.10.0"
},
"boto3": {
"hashes": [
"sha256:7209b79833bdf13753aa24f76bf533890ffed2cc4fe1fe08619d223c209bbd11",
"sha256:f46c93d09acd4d4bfc6b9522ed852fecbdc508e0365f29ddfb3c146aae784b4e"
"sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989",
"sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557"
],
"index": "pypi",
"version": "==1.18.27"
"version": "==1.21.3"
},
"botocore": {
"hashes": [
"sha256:8c99abd7093ab11ce8d09c68732aeeb6065a53d2fe371568452e99291817fff5",
"sha256:b9e2c90bad164d111c229102f58f995c28576e719dd116b446965e1b786f8fa5"
"sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7",
"sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046"
],
"version": "==1.21.27"
"markers": "python_version >= '3.6'",
"version": "==1.24.3"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
],
"index": "pypi",
"version": "==0.0.1"
},
"cachetools": {
"hashes": [
"sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001",
"sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff"
"sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6",
"sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4"
],
"version": "==4.2.2"
"markers": "python_version ~= '3.7'",
"version": "==5.0.0"
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"charset-normalizer": {
"hashes": [
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.4"
"version": "==2.0.12"
},
"click": {
"hashes": [
"sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
"sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.4"
},
"cloudscraper": {
"hashes": [
"sha256:674fd739f9412188aae8d6614e3e6316939fc0670ef5646abd3d316f1a59d3c2",
"sha256:dda29028c5628b5ba3e4dc43816ed38fd46bd945ef938c420f185586a6d8dff2"
],
"version": "==1.2.58"
},
"faker": {
"hashes": [
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
"sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
],
"markers": "python_version >= '3.6'",
"version": "==13.0.0"
},
"ffmpeg-python": {
"hashes": [
@@ -78,55 +109,138 @@
"index": "pypi",
"version": "==0.2.0"
},
"flask": {
"hashes": [
"sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f",
"sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.3"
},
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"google-auth": {
"hashes": [
"sha256:c012c8be7c442c8309ca8fa0876fef33f5fd977c467be1e1c1c2f721e8ebd73c",
"sha256:ea1af050b3e06eb73e4470f704d23007307bc0e87c13e015f6b90460f1407bd3"
"sha256:218ca03d7744ca0c8b6697b6083334be7df49b7bf76a69d555962fd1a7657b5f",
"sha256:ad160fc1ea8f19e331a16a14a79f3d643d813a69534ba9611d2c80dc10439dad"
],
"version": "==2.0.1"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.0"
},
"google-auth-oauthlib": {
"hashes": [
"sha256:4ab58e6c3dc6ccf112f921fcced40e5426fba266768986ea502228488276eaba",
"sha256:b5a1ce7c617d247ccb2dfbba9d4bfc734b41096803d854a2c52592ae80150a67"
"sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73",
"sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a"
],
"version": "==0.4.5"
"markers": "python_version >= '3.6'",
"version": "==0.4.6"
},
"gspread": {
"hashes": [
"sha256:236a0f24e3724b49bae4cbd5144ed036b0ae6feaf5828ad033eb2824bf05e5be",
"sha256:4933c3e2359e82698c0990f3b0e312627fcbf8fecc8bc81d26713f5860e20b48"
"sha256:d9db8c43d552f541ea072d4727d1e955bc2368b095dd86c5429a845c9d8aed8f",
"sha256:ffba57786e27519fb97125e3de37a0f062134a396506681f5baacaf47a9febe3"
],
"index": "pypi",
"version": "==4.0.1"
"version": "==5.1.1"
},
"idna": {
"hashes": [
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.2"
"version": "==3.3"
},
"itsdangerous": {
"hashes": [
"sha256:29285842166554469a56d427addc0843914172343784cb909695fdbe90a3e129",
"sha256:d848fcb8bc7d507c4546b448574e8a44fc4ea2ba84ebf8d783290d53e81992f5"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": {
"hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
"sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.0"
},
"loguru": {
"hashes": [
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
],
"index": "pypi",
"version": "==0.6.0"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"oauthlib": {
"hashes": [
"sha256:42bf6354c2ed8c6acb54d971fce6f88193d97297e18602a3a886603f9d7730cc",
"sha256:8f0215fcc533dd8dd1bee6f4c412d4f0cd7297307d43ac61666389e3bc3198a3"
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"version": "==3.1.1"
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"pyasn1": {
"hashes": [
@@ -164,79 +278,112 @@
],
"version": "==0.2.8"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
"sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.7"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"python-dotenv": {
"hashes": [
"sha256:aae25dc1ebe97c420f50b81fb0e5c949659af713f31fdb63c749ca68748f34b1",
"sha256:f521bc2ac9a8e03c736f62911605c5d83970021e3fa95b37d769e2bbbe9b6172"
"sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3",
"sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f"
],
"index": "pypi",
"version": "==0.19.0"
"version": "==0.19.2"
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
"version": "==2.26.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.27.1"
},
"requests-oauthlib": {
"hashes": [
"sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
"sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
"sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
"sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
"sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
],
"version": "==1.3.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.1"
},
"requests-toolbelt": {
"hashes": [
"sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f",
"sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"
],
"version": "==0.9.1"
},
"rsa": {
"hashes": [
"sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2",
"sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"version": "==4.7.2"
"markers": "python_version >= '3.6'",
"version": "==4.8"
},
"s3transfer": {
"hashes": [
"sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c",
"sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
],
"version": "==0.5.0"
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"soupsieve": {
"hashes": [
"sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
"sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
"sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
"sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"
],
"markers": "python_version >= '3.0'",
"version": "==2.2.1"
"markers": "python_version >= '3.6'",
"version": "==2.3.1"
},
"tiktok-downloader": {
"git": "https://github.com/msramalho/tiktok-downloader",
"ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2"
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"version": "==1.26.6"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"werkzeug": {
"hashes": [
"sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
"sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.3"
},
"youtube-dl": {
"hashes": [
"sha256:263e04d53fb8ba3dfbd246ad09b7d388e896c132a20cc770c26ee7684de050ac",
"sha256:cb2d3ee002158ede783e97a82c95f3817594df54367ea6a77ce5ceea4772f0ab"
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
],
"index": "pypi",
"version": "==2021.6.6"
"version": "==2021.12.17"
}
},
"develop": {}

View File

@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
@@ -66,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
# Code structure
Code is split into functional concepts:
1. [Archivers](archivers/) - receive a URL that they try to archive
2. [Storages](storages/) - they deal with where the archived files go
3. [Utilities](utils/)
1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
### Current Archivers
```mermaid
graph TD
A(Archiver) -->|parent of| B(TelegramArchiver)
A -->|parent of| C(TiktokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
A -->|parent of| E(WaybackArchiver)
```
### Current Storages
```mermaid
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
```

6
archivers/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
# we need to explicitly expose the available imports here
from .base_archiver import *
from .telegram_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *

107
archivers/base_archiver.py Normal file
View File

@@ -0,0 +1,107 @@
import datetime
import os
import shutil
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, Union
from urllib.parse import urlparse

import ffmpeg

from storages import Storage
from utils import mkdir_if_not_exists
@dataclass
class ArchiveResult:
    """
    Outcome of a single archiving attempt.

    Only `status` is required; every other field defaults to None and is
    filled in by the archiver that produced the result when it is available.
    """
    # human-readable outcome, e.g. 'success' or 'already archived'
    status: str
    # public URL of the archived media (or of the web.archive.org snapshot)
    cdn_url: Optional[str] = None
    # URL of the single thumbnail picked to represent the media
    thumbnail: Optional[str] = None
    # URL of the HTML page listing all generated thumbnails
    thumbnail_index: Optional[str] = None
    # media duration as reported by the archiver (seconds)
    duration: Optional[float] = None
    title: Optional[str] = None
    # archivers in this package pass either an ISO-like string or an epoch
    # number here, so the annotation is deliberately wider than datetime
    timestamp: Optional[Union[str, float, datetime.datetime]] = None
class Archiver(ABC):
    """
    Abstract base class for archivers.

    An archiver receives a URL in download() and tries to archive it, using
    the storage object given at construction time to upload files and to
    build their public URLs.
    """
    # prefix of every storage key built by get_key(); subclasses override it
    name = "default"

    def __init__(self, storage: "Storage"):
        self.storage = storage

    def __str__(self):
        return self.__class__.__name__

    @abstractmethod
    def download(self, url, check_if_exists=False): pass

    def get_netloc(self, url):
        """Return the network location (host[:port]) part of `url`."""
        return urlparse(url).netloc

    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]" includes extension
        """
        tail = os.path.split(filename)[1]  # returns filename.ext from full path
        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        return f'{self.name}_{_id}{extension}'

    @staticmethod
    def _thumbnail_fps(duration=None):
        """
        Return the frame-extraction rate for a video of `duration` seconds.

        Aims at roughly 10, 20 or 40 frames for videos shorter than 60 s,
        shorter than 120 s, or longer. Falls back to 0.5 fps when the duration
        is unknown or not positive (a zero duration used to raise
        ZeroDivisionError).
        """
        default_fps = 0.5
        if duration is None:
            return default_fps
        duration = float(duration)
        if duration <= 0:
            return default_fps
        if duration < 60:
            return 10.0 / duration
        if duration < 120:
            return 20.0 / duration
        return 40.0 / duration

    @staticmethod
    def _frame_index(fname):
        """Return the number embedded in an ``out<N>.jpg`` file name (0 if none)."""
        digits = ''.join(ch for ch in fname if ch.isdigit())
        return int(digits) if digits else 0

    def get_thumbnails(self, filename, key, duration=None):
        """
        Extract thumbnails from the local video `filename` and upload them.

        Frames are written to a temporary folder next to `filename`, uploaded
        under a folder derived from `key`, and listed in an uploaded
        index.html page.

        Returns a tuple (thumbnail_url, index_page_url), or ('None', 'None')
        when no frame was produced. The temporary folder is removed in every
        case, including when ffmpeg or an upload raises.
        """
        thumbnails_folder = filename.split('.')[0] + '/'
        key_folder = key.split('.')[0] + '/'

        mkdir_if_not_exists(thumbnails_folder)
        try:
            fps = self._thumbnail_fps(duration)

            stream = ffmpeg.input(filename)
            stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
            stream.output(thumbnails_folder + 'out%d.jpg').run()

            # os.listdir returns entries in arbitrary order; sort by frame
            # number so the thumbnail picked below is really ~10% into the video
            # and the index page shows frames chronologically
            thumbnails = sorted(
                (fname for fname in os.listdir(thumbnails_folder) if fname.endswith('jpg')),
                key=lambda fname: (self._frame_index(fname), fname))

            cdn_urls = []
            for fname in thumbnails:
                thumbnail_filename = thumbnails_folder + fname
                thumb_key = key_folder + fname

                cdn_url = self.storage.get_cdn_url(thumb_key)
                self.storage.upload(thumbnail_filename, thumb_key)
                cdn_urls.append(cdn_url)

            if len(cdn_urls) == 0:
                return ('None', 'None')

            key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

            index_page = f'''<html><head><title>{filename}</title></head>
<body>'''
            for t in cdn_urls:
                index_page += f'<img src="{t}" />'
            index_page += "</body></html>"

            index_fname = thumbnails_folder + 'index.html'
            with open(index_fname, 'w') as f:
                f.write(index_page)

            thumb_index = key_folder + 'index.html'
            self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

            thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
            return (key_thumb, thumb_index_cdn_url)
        finally:
            # best-effort cleanup; must not mask an exception raised above
            shutil.rmtree(thumbnails_folder, ignore_errors=True)

View File

@@ -0,0 +1,65 @@
import os
import requests
from bs4 import BeautifulSoup
from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver):
    """Archives the video embedded in a public t.me post."""
    name = "telegram"

    @staticmethod
    def _parse_duration(text):
        """
        Convert a 'SS', 'MM:SS' or 'HH:MM:SS' string into seconds.

        Returns None when the text cannot be parsed.
        """
        seconds = 0.0
        try:
            for part in str(text).strip().split(':'):
                seconds = seconds * 60 + float(part)
        except ValueError:
            return None
        return seconds

    def download(self, url, check_if_exists=False):
        """
        Download the video of the Telegram post at `url` and upload it.

        Returns False when the URL is not a t.me URL or no video is found on
        the embed page, otherwise an ArchiveResult. With `check_if_exists`,
        a video already present in storage is not uploaded again (it is still
        downloaded, because thumbnails are generated from the local file).
        """
        # detect URLs that we definitely cannot handle
        if 't.me' != self.get_netloc(url):
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        status = "success"

        original_url = url

        # TODO: check if we can do this more resilient to variable URLs
        if url[-8:] != "?embed=1":
            url += "?embed=1"

        t = requests.get(url, headers=headers)
        s = BeautifulSoup(t.content, 'html.parser')

        video = s.find("video")
        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        if not video_url:
            return False  # <video> tag without a source cannot be downloaded

        video_id = video_url.split('/')[-1].split('?')[0]
        key = self.get_key(video_id)

        filename = 'tmp/' + key
        cdn_url = self.storage.get_cdn_url(key)

        if check_if_exists and self.storage.exists(key):
            status = 'already archived'

        v = requests.get(video_url, headers=headers)

        with open(filename, 'wb') as f:
            f.write(v.content)

        try:
            if status != 'already archived':
                self.storage.upload(filename, key)

            # extract duration from HTML; the seconds part of "MM:SS" used to
            # be dropped because the addition sat on its own statement line
            time_tags = s.find_all('time')
            duration = None
            if time_tags and time_tags[0].contents:
                duration = self._parse_duration(time_tags[0].contents[0])

            # process thumbnails
            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        finally:
            # always drop the temporary download, even if upload/thumbnails fail
            os.remove(filename)

        # the second <time> tag carries the post date; it may be absent
        timestamp = time_tags[1].get('datetime') if len(time_tags) > 1 else None

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=timestamp)

View File

@@ -0,0 +1,60 @@
import os, traceback
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver):
    """Archives TikTok videos through the `tiktok_downloader` package."""
    name = "tiktok"

    def download(self, url, check_if_exists=False):
        """
        Download the TikTok video at `url`, upload it and build thumbnails.

        Returns False when `url` does not contain 'tiktok.com'; otherwise an
        ArchiveResult whose status describes success or the failure reason.
        Errors raised while archiving are reported through the status rather
        than propagated.
        """
        if 'tiktok.com' not in url:
            return False

        status = 'success'

        try:
            info = tiktok_downloader.info_post(url)
            key = self.get_key(f'{info.id}.mp4')
            cdn_url = self.storage.get_cdn_url(key)
            filename = 'tmp/' + key

            if check_if_exists and self.storage.exists(key):
                status = 'already archived'

            media = tiktok_downloader.snaptik(url).get_media()

            if not media:
                if status == 'already archived':
                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
                else:
                    return ArchiveResult(status='Could not download media')

            media[0].download(filename)

            try:
                if status != 'already archived':
                    self.storage.upload(filename, key)

                try:
                    key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
                except Exception as e:
                    logger.error(e)
                    key_thumb = ''
                    thumb_index = 'error creating thumbnails'
            finally:
                # remove the temporary download even when the upload fails
                try:
                    os.remove(filename)
                except FileNotFoundError:
                    logger.info(f'tmp file not found thus not deleted {filename}')

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())

        except tiktok_downloader.Except.InvalidUrl:
            status = 'Invalid URL'
            return ArchiveResult(status=status)

        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run
            error = traceback.format_exc()
            status = 'Other Tiktok error: ' + str(error)
            return ArchiveResult(status=status)

View File

@@ -0,0 +1,76 @@
import time, requests, os
from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
    """Archives a page by asking the Internet Archive to save it."""
    name = "wayback"

    def __init__(self, storage: Storage):
        super().__init__(storage)
        # successful results by URL, reused when check_if_exists is set
        self.seen_urls = {}

    @staticmethod
    def _is_pending(response):
        """Return True while the save job behind `response` is not finished."""
        if response.status_code != 200:
            return True
        try:
            return response.json().get('status') == 'pending'
        except ValueError:
            # body was not JSON; treat as "not ready yet" and poll again
            return True

    def download(self, url, check_if_exists=False):
        """
        Submit `url` to web.archive.org and wait for the save job to finish.

        Always returns an ArchiveResult; failures are reported in its status.
        Requires the INTERNET_ARCHIVE_S3_KEY and INTERNET_ARCHIVE_S3_SECRET
        environment variables.
        """
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        s3_key = os.getenv('INTERNET_ARCHIVE_S3_KEY')
        s3_secret = os.getenv('INTERNET_ARCHIVE_S3_SECRET')
        if s3_key is None or s3_secret is None:
            # previously this surfaced as an opaque TypeError on str + None
            return ArchiveResult(status="Internet archive failed: INTERNET_ARCHIVE_S3_KEY / INTERNET_ARCHIVE_S3_SECRET not set")

        ia_headers = {
            "Accept": "application/json",
            "Authorization": "LOW " + s3_key + ":" + s3_secret
        }

        r = requests.post(
            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        try:
            submitted = r.json()
        except ValueError:
            return ArchiveResult(status="Internet archive failed")

        if 'job_id' not in submitted:
            if 'message' in submitted:
                return ArchiveResult(status=f"Internet archive failed: {submitted['message']}")
            return ArchiveResult(status="Internet archive failed")

        job_id = submitted['job_id']
        status_url = 'https://web.archive.org/save/status/' + job_id

        status_r = requests.get(status_url, headers=ia_headers)

        retries = 0

        # wait 90-120 seconds for the archive job to finish
        while self._is_pending(status_r) and retries < 30:
            time.sleep(3)

            try:
                status_r = requests.get(status_url, headers=ia_headers)
            except requests.RequestException:
                time.sleep(1)

            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")

        try:
            status_json = status_r.json()
        except ValueError:
            return ArchiveResult(status="Internet archive failed")

        if status_json.get('status') != 'success':
            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))

        archive_url = 'https://web.archive.org/web/' + \
            status_json['timestamp'] + '/' + status_json['original_url']

        # the title is a nice-to-have: any failure fetching it is non-fatal
        try:
            r = requests.get(archive_url)

            parsed = BeautifulSoup(r.content, 'html.parser')

            title = parsed.find_all('title')[0].text

            if title == 'Wayback Machine':
                title = 'Could not get title'
        except Exception:
            title = "Could not get title"

        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
        self.seen_urls[url] = result
        return result

View File

@@ -0,0 +1,92 @@
import os
import datetime
import youtube_dl
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
    """Archives any single video that youtube_dl is able to download."""
    # NOTE(review): "yotube_dl" looks like a typo, but it is the prefix of every
    # storage key (see get_key), so renaming it would orphan existing uploads
    name = "yotube_dl"
    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

    def download(self, url, check_if_exists=False):
        """
        Download the video at `url`, upload it and build thumbnails.

        Returns False when youtube_dl finds no video, or when the page holds
        zero or several videos; otherwise an ArchiveResult. With
        `check_if_exists`, a video already present in storage is not uploaded
        again.
        """
        netloc = self.get_netloc(url)
        if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'):
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
        else:
            # std_headers is module-global: drop a cookie left by a previous
            # Facebook URL so it is not sent to unrelated sites
            youtube_dl.utils.std_headers.pop('cookie', None)

        ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except youtube_dl.utils.DownloadError:
            # no video here
            return False

        if info.get('is_live', False):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            if 'entries' in info:
                if len(info['entries']) > 1:
                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
                    return False
                elif len(info['entries']) == 0:
                    logger.warning(
                        'YoutubeDLArchiver succeeded but did not find video')
                    return False

                filename = ydl.prepare_filename(info['entries'][0])
            else:
                filename = ydl.prepare_filename(info)

            key = self.get_key(filename)

            if self.storage.exists(key):
                status = 'already archived'
                cdn_url = self.storage.get_cdn_url(key)

        # sometimes this results in a different filename, so do this again
        info = ydl.extract_info(url, download=True)

        # TODO: add support for multiple videos
        if 'entries' in info:
            if len(info['entries']) > 1:
                logger.warning(
                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
                return False
            elif len(info['entries']) == 0:
                # without this guard, entries[0] below raised IndexError
                logger.warning(
                    'YoutubeDLArchiver succeeded but did not find video')
                return False
            else:
                info = info['entries'][0]

        filename = ydl.prepare_filename(info)

        if not os.path.exists(filename):
            # splitext only strips the final extension, so ids or paths that
            # themselves contain dots are kept intact
            filename = os.path.splitext(filename)[0] + '.mkv'

        if status != 'already archived':
            key = self.get_key(filename)
            cdn_url = self.storage.get_cdn_url(key)

            self.storage.upload(filename, key)

        # get duration
        duration = info.get('duration')

        # get thumbnails
        try:
            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        except Exception as e:
            logger.error(e)
            key_thumb = ''
            thumb_index = 'Could not generate thumbnails'

        try:
            os.remove(filename)
        except FileNotFoundError:
            logger.info(f'tmp file not found thus not deleted {filename}')

        timestamp = None
        if 'timestamp' in info:
            timestamp = info['timestamp']
        elif info.get('upload_date') is not None:
            timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                             title=info['title'] if 'title' in info else None, timestamp=timestamp)

View File

@@ -1,506 +1,133 @@
import gspread
import youtube_dl
from pathlib import Path
import sys
import datetime
import boto3
import os
from dotenv import load_dotenv
from botocore.errorfactory import ClientError
import datetime
import argparse
import math
import ffmpeg
import threading
import time
from bs4 import BeautifulSoup
import requests
import shutil
import gspread
from loguru import logger
from dotenv import load_dotenv
import archivers
from storages import S3Storage, S3Config
from utils import GWorksheet, mkdir_if_not_exists
load_dotenv()
def col_to_index(col):
col = list(col)
ndigits = len(col)
alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
v = 0
i = ndigits - 1
def update_sheet(gw, row, result: archivers.ArchiveResult):
cell_updates = []
row_values = gw.get_row(row)
for digit in col:
index = alphabet.find(digit)
v += (26 ** i) * index
i -= 1
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
cell_updates.append((row, col, final_value))
return v - 1
cell_updates.append((row, 'status', result.status))
batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.now().isoformat())
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration))
def index_to_col(index):
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
if result.timestamp and type(result.timestamp) != str:
result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
batch_if_valid('timestamp', result.timestamp)
if index > 25:
t = index
dig = 0
while t > 25:
t = math.floor(t / 26)
dig += 1
return alphabet[t - 1] + index_to_col(index - t * int(math.pow(26, dig)))
else:
return alphabet[index]
gw.batch_set_cell(cell_updates)
def get_cdn_url(key):
    """Return the public Digital Ocean Spaces CDN URL for the object `key`.

    The bucket and region are read from the DO_BUCKET and DO_SPACES_REGION
    environment variables.
    """
    bucket = os.getenv('DO_BUCKET')
    region = os.getenv('DO_SPACES_REGION')
    return f'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
def do_s3_upload(s3_client, f, key):
    """Upload the open file object `f` as `key`, publicly readable.

    The target bucket is read from the DO_BUCKET environment variable.
    """
    bucket = os.getenv('DO_BUCKET')
    extra_args = {'ACL': 'public-read'}
    s3_client.upload_fileobj(f, Bucket=bucket, Key=key, ExtraArgs=extra_args)
def get_thumbnails(filename, s3_client, duration = None):
    """
    Extract thumbnails from the local video `filename` and upload them.

    Frames are written to a folder named after `filename` without its
    extension, uploaded through `s3_client`, and listed in an uploaded
    index.html page.

    Returns a tuple (thumbnail_url, index_page_url), or ('None', 'None') when
    no frame was produced.
    """
    folder = filename.split('.')[0]
    key_prefix = filename.split('/')[1].split('.')[0]

    if not os.path.exists(folder):
        os.mkdir(folder)

    fps = 0.5
    if duration is not None:
        duration = float(duration)

        # a zero (or negative) duration keeps the default rate instead of
        # raising ZeroDivisionError
        if duration > 0:
            if duration < 60:
                fps = 10.0 / duration
            elif duration < 120:
                fps = 20.0 / duration
            else:
                fps = 40.0 / duration

    stream = ffmpeg.input(filename)
    stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
    stream.output(folder + '/out%d.jpg').run()

    # os.listdir order is arbitrary; sort by frame number so the thumbnail
    # picked below is really ~10% into the video
    thumbnails = sorted(
        (fname for fname in os.listdir(folder + '/') if fname[-3:] == 'jpg'),
        key=lambda fname: (int(''.join(ch for ch in fname if ch.isdigit()) or 0), fname))

    cdn_urls = []
    for fname in thumbnails:
        thumbnail_filename = folder + '/' + fname
        key = key_prefix + '/' + fname

        cdn_url = get_cdn_url(key)

        with open(thumbnail_filename, 'rb') as f:
            do_s3_upload(s3_client, f, key)

        cdn_urls.append(cdn_url)
        os.remove(thumbnail_filename)

    if len(cdn_urls) == 0:
        return ('None', 'None')

    key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]

    index_page = f'''<html><head><title>{filename}</title></head>
<body>'''

    for t in cdn_urls:
        index_page += f'<img src="{t}" />'

    index_page += "</body></html>"
    index_fname = folder + '/index.html'

    with open(index_fname, 'w') as f:
        f.write(index_page)

    thumb_index = key_prefix + '/index.html'

    # open inside a with-block: the handle used to be passed inline and was
    # never closed
    with open(index_fname, 'rb') as f:
        s3_client.upload_fileobj(f, Bucket=os.getenv(
            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})

    thumb_index_cdn_url = get_cdn_url(thumb_index)

    return (key_thumb, thumb_index_cdn_url)
def _parse_clock_duration(text):
    """Convert a ``[[h:]m:]s`` clock string (or a plain number) to seconds."""
    seconds = 0.0
    for part in text.split(':'):
        seconds = seconds * 60 + float(part)
    return seconds


def download_telegram_video(url, s3_client, check_if_exists=False):
    """Archive the video embedded in a Telegram post.

    The post is fetched in its ``?embed=1`` form, the ``<video>`` source is
    downloaded to ``tmp/``, uploaded with ``do_s3_upload`` (unless it is
    already in the bucket) and thumbnailed with ``get_thumbnails``.

    Args:
        url: URL of the Telegram post.
        s3_client: Client used for ``head_object`` and for the uploads.
        check_if_exists: When true, look the key up in the bucket first and
            skip the video upload if it is already there. The video is still
            downloaded because it is needed to generate thumbnails.

    Returns:
        ``(video_data, status)``. ``video_data`` is ``{}`` and status is
        ``'No telegram video found'`` when the page has no usable ``<video>``;
        otherwise status is ``'success'`` or ``'already archived'``.
    """
    status = 'success'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    original_url = url
    if url[-8:] != "?embed=1":
        url += "?embed=1"

    t = requests.get(url, headers=headers)
    s = BeautifulSoup(t.content, 'html.parser')
    video = s.find("video")
    # a <video> tag without a src attribute is as unusable as no tag at all
    video_url = video.get('src') if video is not None else None
    if not video_url:
        return ({}, 'No telegram video found')

    key = video_url.split('/')[-1].split('?')[0]
    filename = 'tmp/' + key
    cdn_url = get_cdn_url(key)

    if check_if_exists:
        try:
            s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
            # file exists
            status = 'already archived'
        except ClientError:
            pass

    v = requests.get(video_url, headers=headers)
    with open(filename, 'wb') as f:
        f.write(v.content)

    try:
        if status != 'already archived':
            with open(filename, 'rb') as f:
                do_s3_upload(s3_client, f, key)

        # The first <time> element is read as the video length. It may be
        # "m:ss" or "h:mm:ss", so every colon-separated part is folded in.
        duration = _parse_clock_duration(s.find_all('time')[0].contents[0])
        key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
    finally:
        # never leave the downloaded file behind, even when a step above fails
        os.remove(filename)

    video_data = {
        'cdn_url': cdn_url,
        'thumbnail': key_thumb,
        'thumbnail_index': thumb_index,
        'duration': duration,
        'title': original_url,
        'timestamp': s.find_all('time')[1].get('datetime')
    }
    return (video_data, status)
def internet_archive(url, s3_client):
    """Ask the Internet Archive to save ``url`` and wait for the capture.

    A save job is submitted to ``https://web.archive.org/save/`` using the
    ``INTERNET_ARCHIVE_S3_KEY`` / ``INTERNET_ARCHIVE_S3_SECRET`` credentials,
    then its status is polled up to 40 times with a 5 second pause between
    polls.

    Args:
        url: Page to archive.
        s3_client: Not used by this function.

    Returns:
        ``({'cdn_url': <capture url>, 'title': <page title or None>},
        'Internet Archive fallback')`` on success, otherwise ``({}, <failure
        text>)``.
    """
    ia_headers = {
        "Accept": "application/json",
        "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
    }

    r = requests.post(
        'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
    if r.status_code != 200:
        return ({}, 'Internet archive failed')

    status_url = 'https://web.archive.org/save/status/' + r.json()['job_id']
    status_r = requests.get(status_url, headers=ia_headers)

    retries = 0
    while status_r.json()['status'] == 'pending' and retries < 40:
        time.sleep(5)
        try:
            status_r = requests.get(status_url, headers=ia_headers)
        except requests.exceptions.RequestException:
            # transient network failure: keep the previous response and retry;
            # anything else (e.g. KeyboardInterrupt) is allowed to propagate
            time.sleep(1)
        retries += 1

    status_json = status_r.json()
    if status_json['status'] != 'success':
        # a job that is still pending after all retries carries no 'message'
        message = status_json.get(
            'message', 'no message returned (status: ' + str(status_json['status']) + ')')
        return ({}, 'Internet Archive failed: ' + str(message))

    url = 'https://web.archive.org/web/' + \
        status_json['timestamp'] + '/' + status_json['original_url']
    r = requests.get(url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    # captured pages are not guaranteed to contain a <title> element
    title_tag = parsed.find('title')
    title = title_tag.text if title_tag is not None else None
    return ({'cdn_url': url, 'title': title}, 'Internet Archive fallback')
def get_key(filename):
    """Derive the storage key from a downloaded file's path.

    The key is the second '/'-separated component of ``filename``; any
    occurrence of 'unknown_video' in it is replaced with 'jpg'.
    """
    basename = filename.split('/')[1]
    # str.replace leaves the name untouched when the marker is absent
    return basename.replace('unknown_video', 'jpg')
def download_vid(url, s3_client, check_if_exists=False):
# Download the media at `url` with youtube-dl into tmp/, upload it with
# do_s3_upload, build thumbnails with get_thumbnails and return a
# (video_data, status) tuple. Status starts as 'success' and becomes
# 'already archived' when check_if_exists is true and head_object finds the key.
# Raises Exception when the URL resolves to more than one entry.
# NOTE(review): this span looks like a diff rendering with indentation stripped
# and a few lines of another revision interleaved; comments below mark them.
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
# NOTE(review): 'https://wwww.facebook.com/' has four w's and is 26 characters
# long, so it can never equal the 25-character slice url[0:25];
# 'https://www.facebook.com/' was presumably intended -- confirm.
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
print('Using cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
cdn_url = None
status = 'success'
# Optional pre-check: resolve the filename without downloading and look the
# resulting key up in the bucket.
if check_if_exists:
info = ydl.extract_info(url, download=False)
if 'entries' in info:
if len(info['entries']) > 1:
raise Exception(
'ERROR: Cannot archive channels or pages with multiple videos')
filename = ydl.prepare_filename(info['entries'][0])
else:
filename = ydl.prepare_filename(info)
key = get_key(filename)
# NOTE(review): the next three lines look like the beginning of a separate
# expand_url definition that the diff view interleaved here; they do not seem
# to belong to download_vid -- confirm before editing.
def expand_url(url):
# expand short URL links
if 'https://t.co/' in url:
try:
s3_client.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
# NOTE(review): get_cdn_url as defined earlier in this file takes a single
# `key` argument; passing `os` as well would raise TypeError, which the
# `except ClientError` below does not catch -- confirm.
cdn_url = get_cdn_url(os, key)
status = 'already archived'
except ClientError:
pass
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
if 'entries' in info:
if len(info['entries']) > 1:
raise Exception(
'ERROR: Cannot archive channels or pages with multiple videos')
else:
info = info['entries'][0]
filename = ydl.prepare_filename(info)
# fall back to an .mkv name when the prepared filename is not on disk
# (presumably because the download was merged into a Matroska container -- TODO confirm)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = get_key(filename)
# NOTE(review): same two-argument get_cdn_url call as above -- confirm.
cdn_url = get_cdn_url(os, key)
with open(filename, 'rb') as f:
do_s3_upload(s3_client, f, key)
duration = info['duration'] if 'duration' in info else None
key_thumb, thumb_index = get_thumbnails(filename, s3_client, duration=duration)
# the local copy is removed once the upload and thumbnails are done
os.remove(filename)
# 'timestamp' prefers info['timestamp'], then a value parsed from
# info['upload_date'] (format %Y%m%d), and is None when neither key is present
video_data = {
'cdn_url': cdn_url,
'thumbnail': key_thumb,
'thumbnail_index': thumb_index,
'duration': duration,
'title': info['title'] if 'title' in info else None,
'timestamp': info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None,
}
return (video_data, status)
def update_sheet(wks, row, status, video_data, columns, v):
    """Push the archive result for one row to the worksheet in a single batch.

    The status cell is written whenever a status column is configured. Every
    other cell is written only if its column is configured, the matching
    entry of ``video_data`` is usable, and the cell is currently empty in
    ``v``.

    Args:
        wks: Worksheet object exposing ``batch_update``.
        row: Sheet row number used to build the A1 ranges.
        status: Text for the status column.
        video_data: Dict that may hold 'cdn_url', 'date', 'thumbnail',
            'thumbnail_index', 'timestamp', 'title' and 'duration'.
        columns: Maps logical column names to column letters, or to None when
            the sheet has no such column.
        v: Current values of the row, indexed through ``col_to_index``.
    """
    def has_col(name):
        # a logical column is usable only when a letter is configured for it
        return columns[name] is not None

    def is_blank(name):
        # existing cell content is never overwritten
        return v[col_to_index(columns[name])] == ''

    def cell(name, value):
        return {
            'range': columns[name] + str(row),
            'values': [[value]]
        }

    update = []

    if has_col('status'):
        update.append(cell('status', status))

    if 'cdn_url' in video_data and video_data['cdn_url'] is not None and has_col('archive') and is_blank('archive'):
        update.append(cell('archive', video_data['cdn_url']))

    if 'date' in video_data and has_col('date') and is_blank('date'):
        update.append(cell('date', datetime.datetime.now().isoformat()))

    if 'thumbnail' in video_data and has_col('thumbnail') and is_blank('thumbnail'):
        update.append(cell('thumbnail', '=IMAGE("' + video_data['thumbnail'] + '")'))

    if 'thumbnail_index' in video_data and has_col('thumbnail_index') and is_blank('thumbnail_index'):
        update.append(cell('thumbnail_index', video_data['thumbnail_index']))

    if 'timestamp' in video_data and has_col('timestamp') and video_data['timestamp'] is not None and is_blank('timestamp'):
        stamp = video_data['timestamp']
        # strings are written as-is, numeric epoch values as ISO timestamps
        if type(stamp) == str:
            shown = stamp
        else:
            shown = datetime.datetime.fromtimestamp(stamp).isoformat()
        update.append(cell('timestamp', shown))

    if 'title' in video_data and has_col('title') and video_data['title'] is not None and is_blank('title'):
        update.append(cell('title', video_data['title']))

    if 'duration' in video_data and has_col('duration') and video_data['duration'] is not None and is_blank('duration'):
        update.append(cell('duration', str(video_data['duration'])))

    wks.batch_update(update, value_input_option='USER_ENTERED')
def record_stream(url, s3_client, wks, i, columns, v):
    """Download ``url`` with download_vid and record the outcome on sheet row ``i``."""
    outcome = download_vid(url, s3_client)
    video_data, status = outcome
    update_sheet(wks, i, status, video_data, columns, v)
r = requests.get(url)
url = r.url
except:
logger.error(f'Failed to expand url {url}')
return url
def process_sheet(sheet):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
n_worksheets = len(sh.worksheets())
s3_client = boto3.client('s3',
region_name=os.getenv('DO_SPACES_REGION'),
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.getenv('DO_SPACES_REGION')),
aws_access_key_id=os.getenv('DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
# loop through worksheets to check
for ii in range(n_worksheets):
print("Opening worksheet " + str(ii))
wks = sh.get_worksheet(ii)
values = wks.get_all_values()
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
gw = GWorksheet(wks)
headers = [v.lower() for v in values[0]]
columns = {}
columns['url'] = index_to_col(headers.index(
'media url')) if 'media url' in headers else index_to_col(headers.index(
'source url')) if 'source url' in headers else None
if columns['url'] is None:
print("No 'Media URL' column found, skipping")
if not gw.col_exists('url'):
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
url_index = col_to_index(columns['url'])
columns['archive'] = index_to_col(headers.index(
'archive location')) if 'archive location' in headers else None
columns['date'] = index_to_col(headers.index(
'archive date')) if 'archive date' in headers else None
columns['status'] = index_to_col(headers.index(
'archive status')) if 'archive status' in headers else None
if columns['status'] is None:
print("No 'Archive status' column found, skipping")
if not gw.col_exists('status'):
logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
continue
columns['thumbnail'] = index_to_col(headers.index(
'thumbnail')) if 'thumbnail' in headers else None
columns['thumbnail_index'] = index_to_col(headers.index(
'thumbnail index')) if 'thumbnail index' in headers else None
columns['timestamp'] = index_to_col(headers.index(
'upload timestamp')) if 'upload timestamp' in headers else None
columns['title'] = index_to_col(headers.index(
'upload title')) if 'upload title' in headers else None
columns['duration'] = index_to_col(headers.index(
'duration')) if 'duration' in headers else None
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet}/{wks.title}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
archivers.YoutubeDLArchiver(s3_client),
archivers.WaybackArchiver(s3_client)
]
# loop through rows in worksheet
for i in range(2, len(values)+1):
v = values[i-1]
for row in range(2, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
status = gw.get_cell(row, 'status')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
latest_val = wks.acell(
columns['status'] + str(i)).value
url = expand_url(url)
# check so we don't step on each others' toes
if latest_val == '' or latest_val is None:
wks.update(
columns['status'] + str(i), 'Archive in progress')
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
if 'http://t.me/' in v[url_index] or 'https://t.me/' in v[url_index]:
video_data, status = download_telegram_video(
v[url_index], s3_client, check_if_exists=True)
if status == 'No telegram video found':
print("Trying Internet Archive fallback")
# TODO: add support for multiple videos/images
try:
result = archiver.download(url, check_if_exists=True)
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
video_data, status = internet_archive(
v[url_index], s3_client)
update_sheet(wks, i, status, video_data, columns, v)
if result:
if result.status in ['success', 'already archived']:
logger.success(f'{archiver} succeeded on row {row}')
break
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
else:
try:
ydl_opts = {
'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (v[url_index][0:21] == 'https://facebook.com/' or v[url_index][0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
print('Using cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv(
'FB_COOKIE')
ydl = youtube_dl.YoutubeDL(ydl_opts)
info = ydl.extract_info(
v[url_index], download=False)
if 'is_live' in info and info['is_live']:
wks.update(columns['status'] +
str(i), 'Recording stream')
t = threading.Thread(target=record_stream, args=(
v[url_index], s3_client, wks, i, columns, v))
t.start()
continue
elif 'is_live' not in info or not info['is_live']:
video_data, status = download_vid(
v[url_index], s3_client, check_if_exists=True)
update_sheet(wks, i, status,
video_data, columns, v)
except:
# i'm sure there's a better way to handle this than nested try/catch blocks
try:
print("Trying Internet Archive fallback")
video_data, status = internet_archive(
v[url_index], s3_client)
update_sheet(wks, i, status,
video_data, columns, v)
except:
# if any unexpected errors occured, log these into the Google Sheet
t, value, traceback = sys.exc_info()
update_sheet(wks, i, str(
value), {}, columns, v)
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet')
args = parser.parse_args()
print("Opening document " + args.sheet)
logger.info(f'Opening document {args.sheet}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet)
shutil.rmtree('tmp')
if __name__ == "__main__":
if __name__ == '__main__':
main()

View File

@@ -1,8 +1,7 @@
import gspread
import subprocess
import argparse
import auto_archive
import datetime
from loguru import logger
def main():
parser = argparse.ArgumentParser(
@@ -11,8 +10,7 @@ def main():
args = parser.parse_args()
print(datetime.datetime.now())
print("Opening document " + args.sheet)
logger.info("Opening document " + args.sheet)
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(args.sheet)
@@ -23,7 +21,7 @@ def main():
for i in range(11, len(values)):
sheet_name = values[i][0]
print("Processing " + sheet_name)
logger.info("Processing " + sheet_name)
auto_archive.process_sheet(sheet_name)

View File

@@ -1,5 +0,0 @@
gspread
youtube_dl
boto3
python-dotenv

3
storages/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# we need to explicitly expose the available imports here
from .base_storage import *
from .s3_storage import *

19
storages/base_storage.py Normal file
View File

@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
class Storage(ABC):
    """Abstract interface that every storage backend implements."""

    @abstractmethod
    def __init__(self, config):
        """Set the backend up from ``config``."""

    @abstractmethod
    def get_cdn_url(self, path):
        """Return the URL for ``path``."""

    @abstractmethod
    def exists(self, path):
        """Tell whether ``path`` is already stored."""

    @abstractmethod
    def uploadf(self, file, key, **kwargs):
        """Store the open file object ``file`` under ``key``."""

    def upload(self, filename: str, key: str, **kwargs):
        """Open ``filename`` in binary mode and hand it to ``uploadf``."""
        with open(filename, 'rb') as handle:
            self.uploadf(handle, key, **kwargs)

49
storages/s3_storage.py Normal file
View File

@@ -0,0 +1,49 @@
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
@dataclass
class S3Config:
    """Settings consumed by S3Storage when it is constructed."""
    # bucket name and region of the Space
    bucket: str
    region: str
    # access key id and secret passed to the boto3 client
    key: str
    secret: str
    # optional key prefix; empty by default
    folder: str = ""
class S3Storage(Storage):
    """Storage implementation that talks to a DigitalOcean Spaces endpoint via boto3."""

    def __init__(self, config: S3Config):
        """Keep the bucket settings and create the boto3 client.

        A non-empty ``config.folder`` is used as prefix for every key and gets
        a trailing '/' appended when it lacks one.
        """
        self.bucket = config.bucket
        self.region = config.region
        self.folder = config.folder
        needs_slash = len(self.folder) and self.folder[-1] != '/'
        if needs_slash:
            self.folder += '/'
        client_options = {
            'region_name': self.region,
            'endpoint_url': f'https://{self.region}.digitaloceanspaces.com',
            'aws_access_key_id': config.key,
            'aws_secret_access_key': config.secret,
        }
        self.s3 = boto3.client('s3', **client_options)

    def _get_path(self, key):
        """Prefix ``key`` with the configured folder."""
        return self.folder + key

    def get_cdn_url(self, key):
        """Return the CDN URL of ``key`` inside the configured folder."""
        path = self._get_path(key)
        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{path}'

    def exists(self, key):
        """Return True when head_object succeeds for ``key``, False on ClientError."""
        try:
            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
        except ClientError:
            return False
        return True

    def uploadf(self, file, key, **kwargs):
        """Upload ``file`` under ``key``; ``extra_args`` defaults to a public-read ACL."""
        default_args = {'ACL': 'public-read'}
        extra_args = kwargs.get("extra_args", default_args)
        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

3
utils/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# we need to explicitly expose the available imports here
from .gworksheet import GWorksheet
from .misc import *

75
utils/gworksheet.py Normal file
View File

@@ -0,0 +1,75 @@
from gspread import utils
class GWorksheet:
    """Wrapper around a gspread worksheet that addresses cells by logical column name."""

    # logical column name -> lower-cased header text expected in row 1
    COLUMN_NAMES = {
        'url': 'media url',
        'archive': 'archive location',
        'date': 'archive date',
        'status': 'archive status',
        'thumbnail': 'thumbnail',
        'thumbnail_index': 'thumbnail index',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
        'duration': 'duration'
    }

    def __init__(self, worksheet, columns=COLUMN_NAMES):
        """Read the header row of ``worksheet`` and remember it lower-cased."""
        self.wks = worksheet
        self.headers = [header.lower() for header in self.wks.row_values(1)]
        self.columns = columns

    def _check_col_exists(self, col: str):
        """Raise when ``col`` is not one of the configured logical names."""
        if col in self.columns:
            return
        raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')

    def _col_index(self, col: str):
        """Return the 0-based position of ``col``'s header in the header row."""
        self._check_col_exists(col)
        header_text = self.columns[col]
        return self.headers.index(header_text)

    def col_exists(self, col: str):
        """Tell whether the sheet has a header for the logical column ``col``."""
        self._check_col_exists(col)
        header_text = self.columns[col]
        return header_text in self.headers

    def count_rows(self):
        """Return the number of rows returned by the worksheet's get_values()."""
        all_values = self.wks.get_values()
        return len(all_values)

    def get_row(self, row: int):
        """Return the values of sheet row ``row`` (1-based)."""
        return self.wks.row_values(row)

    def get_cell(self, row, col: str):
        """
        Return the value stored at (row, col).

        ``row`` is either a 1-based row number or a list of values such as
        the one produced by self.get_row(row); '' is returned when the list
        is too short to contain the column.
        """
        values = self.get_row(row) if type(row) == int else row
        position = self._col_index(col)
        return values[position] if position < len(values) else ''

    def set_cell(self, row: int, col: str, val):
        """Write ``val`` into (row, col); ``row`` is 1-based."""
        # update_cell expects a 1-based column number
        self.wks.update_cell(row, self._col_index(col) + 1, val)

    def batch_set_cell(self, cell_updates):
        """
        Apply a list of (row:int, col:str, val) tuples in one batch update;
        each tuple has the same meaning as the arguments of self.set_cell().
        """
        payload = []
        for row, col, val in cell_updates:
            payload.append({
                'range': self.to_a1(row, col),
                'values': [[val]]
            })
        self.wks.batch_update(payload, value_input_option='USER_ENTERED')

    def to_a1(self, row: int, col: str):
        """Return the A1 notation for (row, col); ``row`` is 1-based."""
        column_number = self._col_index(col) + 1
        return utils.rowcol_to_a1(row, column_number)

5
utils/misc.py Normal file
View File

@@ -0,0 +1,5 @@
import os
def mkdir_if_not_exists(folder):
    """Create the directory ``folder`` unless something already exists at that path.

    The directory is created directly and an "already exists" error is
    ignored, so a concurrent creation between a check and the mkdir call
    cannot make this function fail. Parent directories are not created.
    """
    try:
        os.mkdir(folder)
    except FileExistsError:
        pass