mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 20:28:28 +03:00
Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d2d6db162b | ||
|
|
5cfbcc0137 | ||
|
|
5fdaa6c739 | ||
|
|
3d389ee05b | ||
|
|
0ecbed0df0 | ||
|
|
69bcfea2eb | ||
|
|
2e2e695444 | ||
|
|
493055a8d9 | ||
|
|
6f6eb2db7a | ||
|
|
906ed0f6e0 | ||
|
|
39818e648a | ||
|
|
2bbf534d67 | ||
|
|
6be7536fad | ||
|
|
0654e8c5c6 | ||
|
|
0e3c427371 | ||
|
|
7497bc08c0 | ||
|
|
49863768fe | ||
|
|
7b9483bbf9 | ||
|
|
cd81cae559 | ||
|
|
23894fad51 | ||
|
|
876988b587 | ||
|
|
f95293b84b | ||
|
|
2fbcbe4e8b | ||
|
|
d1e4574c6c | ||
|
|
d347b26d37 |
4
Pipfile
4
Pipfile
@@ -14,7 +14,6 @@ loguru = "*"
|
||||
ffmpeg-python = "*"
|
||||
selenium = "*"
|
||||
snscrape = "*"
|
||||
yt-dlp = "*"
|
||||
telethon = "*"
|
||||
google-api-python-client = "*"
|
||||
google-auth-httplib2 = "*"
|
||||
@@ -23,13 +22,14 @@ oauth2client = "*"
|
||||
python-slugify = "*"
|
||||
pyyaml = "*"
|
||||
dateparser = "*"
|
||||
vk-url-scraper = "*"
|
||||
python-twitter-v2 = "*"
|
||||
instaloader = "*"
|
||||
tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
248
Pipfile.lock
generated
248
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4"
|
||||
"sha256": "7176a6666639452dbf30939fa095ff23518aee6da7d9561de0f12ba0aceed527"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -57,19 +57,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d",
|
||||
"sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734"
|
||||
"sha256:17f0d782487275cac12676a61b3f1a4900954cc454c842b8551ca47a3dcd59b4",
|
||||
"sha256:bf808f7433629650128ab577a9d4a0f4daf072d9f2f3a907b9d567a6952d9154"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.26.66"
|
||||
"version": "==1.26.77"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd",
|
||||
"sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5"
|
||||
"sha256:9d94a02f2584b52c65fb3cb309fb1b29d6d0c36d69062722b0275c1c382c44c9",
|
||||
"sha256:d8aa7bffe2422de282b2d02945b7b45d5fecf00f67b65eebb0b1fa3de1abc6d0"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.29.66"
|
||||
"version": "==1.29.77"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -176,11 +176,11 @@
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
|
||||
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
|
||||
"sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
|
||||
"sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2022.6.15"
|
||||
"version": "==2022.12.7"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
@@ -253,11 +253,97 @@
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||
"sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
|
||||
"sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
|
||||
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
|
||||
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
|
||||
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
|
||||
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
|
||||
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
|
||||
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
|
||||
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
|
||||
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
|
||||
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
|
||||
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
|
||||
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
|
||||
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
|
||||
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
|
||||
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
|
||||
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
|
||||
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
|
||||
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
|
||||
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
|
||||
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
|
||||
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
|
||||
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
|
||||
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
|
||||
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
|
||||
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
|
||||
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
|
||||
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
|
||||
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
|
||||
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
|
||||
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
|
||||
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
|
||||
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
|
||||
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
|
||||
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
|
||||
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
|
||||
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
|
||||
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
|
||||
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
|
||||
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
|
||||
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
|
||||
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
|
||||
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
|
||||
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
|
||||
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
|
||||
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
|
||||
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
|
||||
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
|
||||
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
|
||||
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
|
||||
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
|
||||
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
|
||||
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
|
||||
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
|
||||
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
|
||||
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
|
||||
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
|
||||
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
|
||||
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
|
||||
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
|
||||
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
|
||||
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
|
||||
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
|
||||
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
|
||||
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
|
||||
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
|
||||
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
|
||||
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
|
||||
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
|
||||
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
|
||||
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
|
||||
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
|
||||
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
|
||||
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
|
||||
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
|
||||
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
|
||||
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
|
||||
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
|
||||
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
|
||||
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
|
||||
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
|
||||
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
|
||||
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
|
||||
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
|
||||
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
|
||||
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
|
||||
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
|
||||
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==2.0.12"
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.1"
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
@@ -348,11 +434,11 @@
|
||||
},
|
||||
"flask": {
|
||||
"hashes": [
|
||||
"sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b",
|
||||
"sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526"
|
||||
"sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d",
|
||||
"sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.2.2"
|
||||
"version": "==2.2.3"
|
||||
},
|
||||
"future": {
|
||||
"hashes": [
|
||||
@@ -371,19 +457,19 @@
|
||||
},
|
||||
"google-api-python-client": {
|
||||
"hashes": [
|
||||
"sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483",
|
||||
"sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0"
|
||||
"sha256:577c0aeae1eb3c754eacb9122d369d67609fef759bc6a4fa16cafeab4f30019b",
|
||||
"sha256:b9b6dc5f139892310093ba75d0df4c78f48655078953c923957dab1ec86129e7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.77.0"
|
||||
"version": "==2.79.0"
|
||||
},
|
||||
"google-auth": {
|
||||
"hashes": [
|
||||
"sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc",
|
||||
"sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd"
|
||||
"sha256:5fd170986bce6bfd7bb5c845c4b8362edb1e0cba901e062196e83f8bb5d5d32c",
|
||||
"sha256:75d76ea857df65938e1f71dcbcd7d0cd48e3f80b34b8870ba229c9292081f7ef"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==2.16.0"
|
||||
"version": "==2.16.1"
|
||||
},
|
||||
"google-auth-httplib2": {
|
||||
"hashes": [
|
||||
@@ -435,18 +521,18 @@
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
|
||||
"sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==3.3"
|
||||
"version": "==3.4"
|
||||
},
|
||||
"instaloader": {
|
||||
"hashes": [
|
||||
"sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e"
|
||||
"sha256:16040c170fb5230c1981a47e1990261e3c0ecffe0417be95fa265632244e7c01"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.9.5"
|
||||
"version": "==4.9.6"
|
||||
},
|
||||
"itsdangerous": {
|
||||
"hashes": [
|
||||
@@ -565,11 +651,11 @@
|
||||
},
|
||||
"markdown-it-py": {
|
||||
"hashes": [
|
||||
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
|
||||
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
|
||||
"sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
|
||||
"sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.0"
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
@@ -700,23 +786,22 @@
|
||||
},
|
||||
"protobuf": {
|
||||
"hashes": [
|
||||
"sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30",
|
||||
"sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b",
|
||||
"sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc",
|
||||
"sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791",
|
||||
"sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717",
|
||||
"sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec",
|
||||
"sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7",
|
||||
"sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab",
|
||||
"sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2",
|
||||
"sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5",
|
||||
"sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1",
|
||||
"sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462",
|
||||
"sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97",
|
||||
"sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
|
||||
"sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb",
|
||||
"sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b",
|
||||
"sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e",
|
||||
"sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930",
|
||||
"sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71",
|
||||
"sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4",
|
||||
"sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491",
|
||||
"sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6",
|
||||
"sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1",
|
||||
"sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e",
|
||||
"sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571",
|
||||
"sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e",
|
||||
"sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.21.12"
|
||||
"version": "==4.22.0"
|
||||
},
|
||||
"pyaes": {
|
||||
"hashes": [
|
||||
@@ -838,14 +923,6 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"python-dotenv": {
|
||||
"hashes": [
|
||||
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
|
||||
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==0.20.0"
|
||||
},
|
||||
"python-slugify": {
|
||||
"hashes": [
|
||||
"sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c",
|
||||
@@ -1019,11 +1096,11 @@
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
|
||||
"sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa",
|
||||
"sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"
|
||||
],
|
||||
"markers": "python_version >= '3.7' and python_version < '4'",
|
||||
"version": "==2.28.0"
|
||||
"version": "==2.28.2"
|
||||
},
|
||||
"requests-oauthlib": {
|
||||
"hashes": [
|
||||
@@ -1067,11 +1144,11 @@
|
||||
},
|
||||
"selenium": {
|
||||
"hashes": [
|
||||
"sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683",
|
||||
"sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af"
|
||||
"sha256:bd04eb41395605d9b2b65fe587f3fed21431da75512985c52772529e5e210c60",
|
||||
"sha256:c48372905bffcc3b24bd55ab4683a07ee5e1f30fe918c59558ea5ee44cedf6c3"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.8.0"
|
||||
"version": "==4.8.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
@@ -1106,11 +1183,11 @@
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
|
||||
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
|
||||
"sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955",
|
||||
"sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.3.2.post1"
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.4"
|
||||
},
|
||||
"telethon": {
|
||||
"hashes": [
|
||||
@@ -1160,11 +1237,11 @@
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
|
||||
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
|
||||
"sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
|
||||
"sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.4.0"
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"typing-inspect": {
|
||||
"hashes": [
|
||||
@@ -1198,27 +1275,30 @@
|
||||
"version": "==4.1.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
|
||||
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.9"
|
||||
"hashes": [
|
||||
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
|
||||
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==1.26.14"
|
||||
},
|
||||
"vk-api": {
|
||||
"hashes": [
|
||||
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
|
||||
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
|
||||
"sha256:c71021506449afe5b9bbb1c4acb0d86b35a007ddc21678478e46fbbeabd1f3ef",
|
||||
"sha256:c7741e40bc05980c91ed94c84542e1e7e7370e101b5eaa74222958d4130fe3c2"
|
||||
],
|
||||
"version": "==11.9.8"
|
||||
"version": "==11.9.9"
|
||||
},
|
||||
"vk-url-scraper": {
|
||||
"hashes": [
|
||||
"sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b",
|
||||
"sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed"
|
||||
"sha256:5a32fb5419f7bb8bd35de8548948fe27a06f857a4d086c87e142bf07aabc3fd7",
|
||||
"sha256:a87c5aa7c1570c3aa87031e78c2052105e3681f57503fd4cb56470c3ab6106d6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.3.10"
|
||||
"version": "==0.3.15"
|
||||
},
|
||||
"websockets": {
|
||||
"hashes": [
|
||||
@@ -1297,11 +1377,11 @@
|
||||
},
|
||||
"werkzeug": {
|
||||
"hashes": [
|
||||
"sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f",
|
||||
"sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"
|
||||
"sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe",
|
||||
"sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.2.2"
|
||||
"version": "==2.2.3"
|
||||
},
|
||||
"wsproto": {
|
||||
"hashes": [
|
||||
@@ -1313,11 +1393,11 @@
|
||||
},
|
||||
"yt-dlp": {
|
||||
"hashes": [
|
||||
"sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02",
|
||||
"sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae"
|
||||
"sha256:3b2df037c80922f0f83f63ee2f9253496b4a8668c0fe8d2a836ba9040f853b07",
|
||||
"sha256:9af92de5effc193bdb51216d9ebf28874d96180d202fae752b0d9f2a63380f3a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2022.7.18"
|
||||
"version": "==2023.2.17"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
|
||||
12
README.md
12
README.md
@@ -153,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
|
||||
# all the configurations come from ./orchestration.yaml
|
||||
auto-archiver
|
||||
# all the configurations come from ./secrets/orchestration.yaml
|
||||
auto-archiver --config orchestration.yaml
|
||||
# uses the configurations but for another google docs sheet
|
||||
auto-archiver --config secrets/orchestration.yaml
|
||||
# uses the same configurations but for another google docs sheet
|
||||
# with a header on row 2 and with some different column names
|
||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}'
|
||||
auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
||||
auto-archiver --s3_storage.private=1
|
||||
```
|
||||
@@ -166,11 +166,11 @@ auto-archiver --s3_storage.private=1
|
||||
#### Google Drive
|
||||
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
|
||||
|
||||
#### Telethon (Telegrams API Library)
|
||||
#### Telethon + Instagram with telegram bot
|
||||
The first time you run it, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` file in the root.
|
||||
|
||||
|
||||
## Running on Google Sheets Feeder (gsheets_feeder)
|
||||
## Running on Google Sheets Feeder (gsheet_feeder)
|
||||
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
|
||||
This sheet must have been shared with the Google Service account used by `gspread`.
|
||||
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
|
||||
@@ -183,7 +183,7 @@ When the auto archiver starts running, it updates the "Archive status" column.
|
||||

|
||||
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
||||

|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
|
||||
|
||||
---
|
||||
|
||||
@@ -1,22 +1,21 @@
|
||||
steps:
|
||||
# only 1 feeder allowed
|
||||
# feeder: cli_feeder # default feeder
|
||||
feeder: gsheet_feeder # default -> only expects URL from CLI
|
||||
archivers: # order matters
|
||||
feeder: gsheet_feeder # defaults to cli_feeder
|
||||
archivers: # order matters, uncomment to activate
|
||||
# - vk_archiver
|
||||
# - telethon_archiver
|
||||
# - telegram_archiver
|
||||
# - twitter_archiver
|
||||
# - twitter_api_archiver
|
||||
# - instagram_archiver
|
||||
# - instagram_tbot_archiver
|
||||
# - instagram_archiver
|
||||
# - tiktok_archiver
|
||||
- youtubedl_archiver
|
||||
# - wayback_archiver_enricher
|
||||
- wayback_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
- screenshot_enricher
|
||||
- thumbnail_enricher
|
||||
# - screenshot_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_enricher
|
||||
|
||||
@@ -26,16 +25,18 @@ steps:
|
||||
# - s3_storage
|
||||
# - gdrive_storage
|
||||
databases:
|
||||
# - console_db
|
||||
- console_db
|
||||
# - csv_db
|
||||
- gsheet_db
|
||||
# - gsheet_db
|
||||
# - mongo_db
|
||||
|
||||
configurations:
|
||||
gsheet_feeder:
|
||||
sheet: auto-archiver-test
|
||||
header: 2 # defaults to 1 in GSheetsFeeder
|
||||
sheet: "your sheet name"
|
||||
header: 1
|
||||
service_account: "secrets/service_account.json"
|
||||
# allow_worksheets: "only parse this worksheet"
|
||||
# block_worksheets: "blocked sheet 1,blocked sheet 2"
|
||||
use_sheet_names_in_stored_paths: false
|
||||
columns:
|
||||
url: link
|
||||
@@ -53,27 +54,70 @@ configurations:
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
replaywebpage: replaywebpage
|
||||
instagram_tbot_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
telethon_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
join_channels: false
|
||||
channel_invites: # if you want to archive from private channels
|
||||
- invite: https://t.me/+123456789
|
||||
id: 0000000001
|
||||
- invite: https://t.me/+123456788
|
||||
id: 0000000002
|
||||
|
||||
twitter_api_archiver:
|
||||
# either bearer_token only
|
||||
bearer_token: "TWITTER_BEARER_TOKEN"
|
||||
# OR all of the below
|
||||
# consumer_key: ""
|
||||
# consumer_secret: ""
|
||||
# access_token: ""
|
||||
# access_secret: ""
|
||||
instagram_archiver:
|
||||
username: "INSTAGRAM_USERNAME"
|
||||
password: "INSTAGRAM_PASSWORD"
|
||||
# session_file: "secrets/instaloader.session"
|
||||
|
||||
vk_archiver:
|
||||
username: "or phone number"
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
wayback_archiver_enricher:
|
||||
timeout: 10
|
||||
key: ""
|
||||
secret: ""
|
||||
key: "wayback key"
|
||||
secret: "wayback secret"
|
||||
hash_enricher:
|
||||
algorithm: "SHA3-512"
|
||||
# wacz:
|
||||
# profile: secrets/profile.tar.gz
|
||||
algorithm: "SHA3-512" # can also be SHA-256
|
||||
wacz_enricher:
|
||||
profile: secrets/profile.tar.gz
|
||||
local_storage:
|
||||
save_to: "./local_archive"
|
||||
save_absolute: true
|
||||
filename_generator: static
|
||||
path_generator: flat
|
||||
s3_storage:
|
||||
bucket: your-bucket-name
|
||||
region: reg1
|
||||
key: S3_KEY
|
||||
secret: S3_SECRET
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
private: false
|
||||
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files; the alternative is 'default', which also applies when key_path is omitted from the config
|
||||
key_path: random
|
||||
|
||||
gdrive_storage:
|
||||
path_generator: url
|
||||
filename_generator: random
|
||||
root_folder_id: TODO
|
||||
oauth_token: secrets/gd-token.json
|
||||
root_folder_id: folder_id_from_url
|
||||
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
|
||||
service_account: "secrets/service_account.json"
|
||||
|
||||
@@ -3,8 +3,8 @@ from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
import mimetypes, requests
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
from ..core import Metadata, Step, ArchivingContext
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -51,7 +51,7 @@ class Archiver(Step):
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
if item:
|
||||
to_filename = os.path.join(item.get_tmp_dir(), to_filename)
|
||||
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
}
|
||||
|
||||
@@ -2,14 +2,14 @@
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
import time, os
|
||||
|
||||
from sqlite3 import OperationalError
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class InstagramTbotArchiver(Archiver):
|
||||
"""
|
||||
calls a telegram bot to fetch instagram posts/stories...
|
||||
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
|
||||
https://github.com/adw0rd/instagrapi
|
||||
https://t.me/instagram_load_bot
|
||||
"""
|
||||
@@ -20,14 +20,17 @@ class InstagramTbotArchiver(Archiver):
|
||||
self.assert_valid_string("api_id")
|
||||
self.assert_valid_string("api_hash")
|
||||
self.timeout = int(self.timeout)
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
@@ -41,26 +44,33 @@ class InstagramTbotArchiver(Archiver):
|
||||
if not "instagram.com" in url: return False
|
||||
|
||||
result = Metadata()
|
||||
tmp_dir = item.get_tmp_dir()
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
with self.client.start():
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_id = self.client.send_message(entity=chat, message=url).id
|
||||
|
||||
attempts = 0
|
||||
media = None
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(4)
|
||||
while attempts < self.timeout and (not message or not media):
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < self.timeout and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
if post.media and not media:
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media: result.add_media(Media(media))
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class TelethonArchiver(Archiver):
|
||||
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
|
||||
media_posts = self._get_media_posts_in_group(chat, post)
|
||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||
|
||||
tmp_dir = item.get_tmp_dir()
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
title = post.message
|
||||
|
||||
@@ -3,7 +3,7 @@ import tiktok_downloader
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class TiktokArchiver(Archiver):
|
||||
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
|
||||
logger.warning(f'Other Tiktok error {error}')
|
||||
|
||||
try:
|
||||
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||
tiktok_media = tiktok_downloader.snaptik(url).get_media()
|
||||
|
||||
if len(tiktok_media) <= 0:
|
||||
|
||||
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
|
||||
|
||||
from ..utils.misc import dump_payload
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class VkArchiver(Archiver):
|
||||
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
|
||||
|
||||
result.set_content(dump_payload(vk_scrapes))
|
||||
|
||||
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
|
||||
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
|
||||
for filename in filenames:
|
||||
result.add_media(Media(filename))
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import datetime, os, yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||
|
||||
try:
|
||||
# don't download since it can be a live stream
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
|
||||
53
src/auto_archiver/core/context.py
Normal file
53
src/auto_archiver/core/context.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ArchivingContext:
    """
    Singleton context class.
    ArchivingContext.get_instance() to retrieve it if needed
    otherwise just
        ArchivingContext.set(key, value)
    and
        ArchivingContext.get(key, default)

    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
        reset(full_reset=True) will recreate everything including the keep_on_reset status
    """
    # lazily created by get_instance(); shared by every static accessor below
    _instance = None

    def __init__(self):
        # key -> value store for the current context
        self.configs = {}
        # keys that survive a plain reset()
        self.keep_on_reset = set()

    @staticmethod
    def get_instance():
        """Return the shared ArchivingContext, creating it on first use."""
        if ArchivingContext._instance is None:
            ArchivingContext._instance = ArchivingContext()
        return ArchivingContext._instance

    @staticmethod
    def set(key, value, keep_on_reset: bool = False):
        """
        Store `value` under `key` in the shared context.

        When keep_on_reset is True the key is remembered so that a plain
        reset() keeps it; passing False later does not un-mark a key that
        was previously marked.
        """
        # NOTE: no logging here on purpose, values can be arbitrary (large or
        # sensitive) objects and this is called for every context write.
        ac = ArchivingContext.get_instance()
        ac.configs[key] = value
        if keep_on_reset: ac.keep_on_reset.add(key)

    @staticmethod
    def get(key: str, default=None):
        """Return the value stored under `key`, or `default` if it is missing."""
        return ArchivingContext.get_instance().configs.get(key, default)

    @staticmethod
    def reset(full_reset: bool = False):
        """
        Clear the context, keeping only keys marked with keep_on_reset.

        With full_reset=True the keep_on_reset marks are dropped first, so
        every stored value is removed.
        """
        ac = ArchivingContext.get_instance()
        if full_reset: ac.keep_on_reset = set()
        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}

    # ---- custom getters/setters for widely used context values

    @staticmethod
    def set_tmp_dir(tmp_dir: str):
        """Store the temporary directory path under the "tmp_dir" key (not kept on reset)."""
        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir

    @staticmethod
    def get_tmp_dir() -> str:
        """Return the value stored under "tmp_dir", or None if it was never set or was reset."""
        return ArchivingContext.get_instance().configs.get("tmp_dir")
|
||||
@@ -3,18 +3,46 @@ from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Any
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
from .context import ArchivingContext
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Media:
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
properties: dict = field(default_factory=dict)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||
|
||||
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
    """
    Store this media into the provided/available storages [Storage].

    Uses `override_storages` when given, otherwise the "storages" value from
    ArchivingContext. Repeats the process for media found in this media's
    properties (either directly or inside a list), in case they have inner
    media themselves. For now it only goes down 1 level but it's easy to make
    it recursive if needed.
    """
    # the context may have no "storages" entry at all (None), treat it as empty
    storages = override_storages or ArchivingContext.get("storages") or []
    if not storages:
        logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
        return

    for s in storages:
        s.store(self, url)
        # Media can be inside media properties, examples include transformations on original media
        for prop in self.properties.values():
            if isinstance(prop, Media):
                s.store(prop, url)
            if isinstance(prop, list):
                for prop_media in prop:
                    if isinstance(prop_media, Media):
                        s.store(prop_media, url)
|
||||
|
||||
def is_stored(self) -> bool:
|
||||
return len(self.urls) > 0
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
@@ -40,3 +68,6 @@ class Media:
|
||||
|
||||
def is_video(self) -> bool:
|
||||
return self.mimetype.startswith("video")
|
||||
|
||||
def is_audio(self) -> bool:
|
||||
return self.mimetype.startswith("audio")
|
||||
|
||||
@@ -3,24 +3,25 @@ from __future__ import annotations
|
||||
from ast import List, Set
|
||||
from typing import Any, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from .media import Media
|
||||
from .context import ArchivingContext
|
||||
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Metadata:
|
||||
status: str = "no archiver"
|
||||
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
|
||||
media: List[Media] = field(default_factory=list)
|
||||
rearchivable: bool = True # defaults to true, archivers can overwrite
|
||||
|
||||
def __post_init__(self):
|
||||
self.set("_processed_at", datetime.datetime.utcnow())
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||
@@ -30,7 +31,6 @@ class Metadata:
|
||||
if right.status and len(right.status):
|
||||
self.status = right.status
|
||||
self.rearchivable |= right.rearchivable
|
||||
self.tmp_keys |= right.tmp_keys
|
||||
for k, v in right.metadata.items():
|
||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||
@@ -43,10 +43,14 @@ class Metadata:
|
||||
return right.merge(self)
|
||||
return self
|
||||
|
||||
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
|
||||
# if not self.metadata: self.metadata = {}
|
||||
def store(self: Metadata, override_storages: List = None):
|
||||
# calls .store for all contained media. storages [Storage]
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
for media in self.media:
|
||||
media.store(override_storages=storages)
|
||||
|
||||
def set(self, key: str, val: Any) -> Metadata:
|
||||
self.metadata[key] = val
|
||||
if is_tmp: self.tmp_keys.add(key)
|
||||
return self
|
||||
|
||||
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
||||
@@ -64,7 +68,7 @@ class Metadata:
|
||||
return "success" in self.status
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
|
||||
return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2 # url, processed_at
|
||||
|
||||
@property # getter .netloc
|
||||
def netloc(self) -> str:
|
||||
@@ -93,12 +97,6 @@ class Metadata:
|
||||
def get_title(self) -> str:
|
||||
return self.get("title")
|
||||
|
||||
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
|
||||
return self.set("tmp_dir", tmp_dir, True)
|
||||
|
||||
def get_tmp_dir(self) -> str:
|
||||
return self.get("tmp_dir")
|
||||
|
||||
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
|
||||
if type(timestamp) == str:
|
||||
timestamp = parse_dt(timestamp)
|
||||
@@ -139,8 +137,5 @@ class Metadata:
|
||||
_default = self.media[0] if len(self.media) else None
|
||||
return self.get_media_by_id("_final_media", _default)
|
||||
|
||||
def get_clean_metadata(self) -> Metadata:
|
||||
return dict(
|
||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
||||
**{"processed_at": self._processed_at}
|
||||
)
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
@@ -2,6 +2,8 @@ from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Union
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from ..archivers import Archiver
|
||||
from ..feeders import Feeder
|
||||
from ..formatters import Formatter
|
||||
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
for a in self.archivers: a.setup()
|
||||
|
||||
@@ -32,8 +35,9 @@ class ArchivingOrchestrator:
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
try:
|
||||
ArchivingContext.reset()
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||
item.set_tmp_dir(tmp_dir)
|
||||
ArchivingContext.set_tmp_dir(tmp_dir)
|
||||
return self.archive(item)
|
||||
except KeyboardInterrupt:
|
||||
# catches keyboard interruptions to do a clean exit
|
||||
@@ -105,22 +109,12 @@ class ArchivingOrchestrator:
|
||||
|
||||
# 5 - store media
|
||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||
for s in self.storages:
|
||||
for m in result.media:
|
||||
s.store(m, result) # modifies media
|
||||
# Media can be inside media properties, examples include transformations on original media
|
||||
for prop in m.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
s.store(prop, result)
|
||||
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
|
||||
for prop_media in prop:
|
||||
s.store(prop_media, result)
|
||||
result.store()
|
||||
|
||||
# 6 - format and store formatted if needed
|
||||
# enrichers typically need access to already stored URLs etc
|
||||
if (final_media := self.formatter.format(result)):
|
||||
for s in self.storages:
|
||||
s.store(final_media, result)
|
||||
final_media.store(url=url)
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
|
||||
@@ -5,8 +5,7 @@ from urllib.parse import quote
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..utils import GWorksheet
|
||||
|
||||
|
||||
@@ -86,7 +85,7 @@ class GsheetsDb(Database):
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
|
||||
gw: GWorksheet = item.get("gsheet").get("worksheet")
|
||||
row: int = item.get("gsheet").get("row")
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
|
||||
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
|
||||
row: int = ArchivingContext.get("gsheet").get("row")
|
||||
return gw, row
|
||||
|
||||
@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackArchiverEnricher
|
||||
from .hash_enricher import HashEnricher
|
||||
from .thumbnail_enricher import ThumbnailEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
from .whisper_enricher import WhisperEnricher
|
||||
@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
|
||||
super().__init__(config)
|
||||
algo_choices = self.configs()["algorithm"]["choices"]
|
||||
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
||||
self.chunksize = int(self.chunksize)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
with open(m.filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
hash = None
|
||||
if self.algorithm == "SHA-256":
|
||||
hash = hashlib.sha256(bytes)
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash = hashlib.sha3_512(bytes)
|
||||
else: continue
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
|
||||
if len(hd := self.calculate_hash(m.filename)):
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
def calculate_hash(self, filename):
    """
    Return the hex digest of `filename` using the configured algorithm.

    The file is read in pieces of `self.chunksize` bytes so it never has to
    be loaded into memory all at once. Returns an empty string when
    `self.algorithm` is not one of the supported names.
    """
    constructors = {
        "SHA-256": hashlib.sha256,
        "SHA3-512": hashlib.sha3_512,
    }
    make_digest = constructors.get(self.algorithm)
    if make_digest is None:
        return ""

    digest = make_digest()
    with open(filename, "rb") as stream:
        # an empty read marks the end of the file
        while chunk := stream.read(self.chunksize):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver, UrlUtil
|
||||
from ..core import Media, Metadata
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
||||
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
|
||||
return {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
@@ -27,12 +28,11 @@ class ScreenshotEnricher(Enricher):
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(2)
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
# return None
|
||||
|
||||
@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media, Metadata
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.debug(f"generating thumbnails")
|
||||
for i, m in enumerate(to_enrich.media[::]):
|
||||
if m.is_video():
|
||||
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
|
||||
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
logger.debug(f"generating thumbnails for {m.filename}")
|
||||
fps, duration = 0.5, m.get("duration")
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import os, shutil, subprocess, uuid
|
||||
from loguru import logger
|
||||
|
||||
from ..core import Media, Metadata
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from . import Enricher
|
||||
from ..utils import UrlUtil
|
||||
|
||||
|
||||
class WaczEnricher(Enricher):
|
||||
@@ -20,14 +21,20 @@ class WaczEnricher(Enricher):
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
|
||||
121
src/auto_archiver/enrichers/whisper_enricher.py
Normal file
121
src/auto_archiver/enrichers/whisper_enricher.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import traceback
|
||||
import requests, time
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..storages import S3Storage
|
||||
|
||||
|
||||
class WhisperEnricher(Enricher):
    """
    Connects with a Whisper API service to get texts out of audio
    whisper API repository: TODO
    Only works if an S3 compatible storage is used
    """
    name = "whisper_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        assert isinstance(self.api_key, str) and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
        self.timeout = int(self.timeout)

    @staticmethod
    def configs() -> dict:
        return {
            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Submit one whisper job per video/audio media and attach the outcome.

        Each video/audio media is stored first, then submitted. After polling,
        the "whisper_model" property of every successfully submitted media
        holds its job_id plus, when available, the job's artifacts or an
        "error" entry. Media whose submission failed are left untouched.
        """
        if not self._get_s3_storage():
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return

        url = to_enrich.get_url()
        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")

        job_results = {}
        submitted = {}  # media index -> job_id, only for jobs that were accepted
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
                m.store(url=url)
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
                    submitted[i] = job_id
                    logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
                    to_enrich.media[i].set("whisper_model", {"job_id": job_id})
                except Exception as e:
                    logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")

        job_results = self.check_jobs(job_results)

        for i, job_id in submitted.items():
            outcome = job_results.get(job_id)
            details = {"job_id": job_id}
            if isinstance(outcome, dict):
                # successful job: merge the artifact texts/subtitles
                details.update(outcome)
            elif outcome:
                # check_job reports a failed job as a message string
                details["error"] = str(outcome)
            # a False outcome means the job did not finish within the timeout: keep only job_id
            to_enrich.media[i].set("whisper_model", details)

    def submit_job(self, media: Media):
        """
        Create a whisper job for an already stored media and return its id.

        Raises AssertionError if the media has no S3 url among its stored
        urls or if the API does not answer with status 201.
        """
        s3 = self._get_s3_storage()
        s3_url = s3.get_cdn_url(media)
        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
        payload = {
            "url": s3_url,
            "type": self.action,
            # "language": "string" # may be a config
        }
        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
        created = response.json()
        logger.debug(created)
        return created['id']

    def check_jobs(self, job_results: dict):
        """
        Poll all pending jobs until each has an outcome or the timeout elapses.

        `job_results` maps job_id -> False while pending; entries are replaced
        in place by whatever check_job returns and the same dict is returned.
        """
        start_time = time.time()
        all_completed = False
        while not all_completed and (time.time() - start_time) <= self.timeout:
            all_completed = True
            for job_id in job_results:
                # only False means "still pending": an empty result dict is a finished job too
                if job_results[job_id] is not False: continue
                all_completed = False  # at least one not ready
                try: job_results[job_id] = self.check_job(job_id)
                except Exception as e:
                    logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
            if not all_completed: time.sleep(3)
        return job_results

    def check_job(self, job_id):
        """
        Query one job and return its outcome.

        Returns False while the job is processing (or has an unrecognised
        status), an "Error: ..." string when the job failed, and a dict of
        artifact_<n>_subtitle / artifact_<n>_text entries on success.
        Raises AssertionError on a non-200 response.
        """
        r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
        assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
        j = r.json()
        logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
        if j['status'] == "processing": return False
        elif j['status'] == "error": return f"Error: {j['meta']['error']}"
        elif j['status'] == "success":
            r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
            assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
            artifacts = r_res.json()
            logger.success(artifacts)
            result = {}
            for art_id, artifact in enumerate(artifacts):
                subtitle = []
                full_text = []
                # an artifact without "data" simply contributes nothing
                for i, d in enumerate(artifact.get("data") or []):
                    subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
                    full_text.append(d.get('text').strip())
                if not len(subtitle): continue
                result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
                result[f"artifact_{art_id}_text"] = "\n".join(full_text)
            return result
        return False

    def _get_s3_storage(self) -> S3Storage:
        """Return the first S3Storage found in the context's "storages", or None (with a warning) if there is none."""
        for s in ArchivingContext.get("storages") or []:
            if isinstance(s, S3Storage):
                return s
        logger.warning("No S3Storage instance found in storages")
        return None
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
|
||||
def __iter__(self) -> Metadata:
|
||||
for url in self.urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url).set("folder", "cli", True)
|
||||
yield Metadata().set_url(url)
|
||||
ArchivingContext.set("folder", "cli")
|
||||
|
||||
logger.success(f"Processed {len(self.urls)} URL(s)")
|
||||
|
||||
@@ -5,9 +5,10 @@ from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import Gsheets, GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
name = "gsheet_feeder"
|
||||
|
||||
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths":{
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
|
||||
m = Metadata().set_url(url)
|
||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
|
||||
ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
|
||||
yield m
|
||||
|
||||
|
||||
logger.success(f'Finished worksheet {wks.title}')
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
|
||||
@@ -6,7 +6,7 @@ from urllib.parse import quote
|
||||
from loguru import logger
|
||||
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from . import Formatter
|
||||
|
||||
|
||||
@@ -40,10 +40,10 @@ class HtmlFormatter(Formatter):
|
||||
url=url,
|
||||
title=item.get_title(),
|
||||
media=item.media,
|
||||
metadata=item.get_clean_metadata(),
|
||||
metadata=item.metadata,
|
||||
version=__version__
|
||||
)
|
||||
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||
outf.write(content)
|
||||
return Media(filename=html_path)
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
margin: auto;
|
||||
border: 1px solid;
|
||||
border-collapse: collapse;
|
||||
vertical-align:top;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
table.metadata td:first-child {
|
||||
@@ -42,7 +42,7 @@
|
||||
}
|
||||
|
||||
.copy:hover {
|
||||
font-weight: 600;
|
||||
background: aliceblue;
|
||||
cursor: copy;
|
||||
}
|
||||
|
||||
@@ -185,7 +185,11 @@
|
||||
el.addEventListener("copy", (e) => {
|
||||
e.preventDefault();
|
||||
if (e.clipboardData) {
|
||||
e.clipboardData.setData("text/plain", el.textContent);
|
||||
if (el.hasAttribute("copy-value")) {
|
||||
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
|
||||
} else {
|
||||
e.clipboardData.setData("text/plain", el.textContent);
|
||||
}
|
||||
console.log(e.clipboardData.getData("text"))
|
||||
showNotification("copied!")
|
||||
}
|
||||
|
||||
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
|
||||
{% endif %}
|
||||
{% if links %}
|
||||
<a href="{{ url }}">open</a> or
|
||||
<a href="{{ url }}" download="">download</a>
|
||||
<a href="{{ url }}" download="">download</a> or
|
||||
{{ copy_urlize(url, "copy") }}
|
||||
|
||||
<br>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{%- endmacro -%}
|
||||
|
||||
{% macro copy_urlize(val) -%}
|
||||
{% macro copy_urlize(val, href_text) -%}
|
||||
|
||||
{% if val is mapping %}
|
||||
<ul>
|
||||
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
|
||||
</ul>
|
||||
|
||||
{% else %}
|
||||
{% if href_text | length == 0 %}
|
||||
<span class="copy">{{ val | string | urlize }}</span>
|
||||
{% else %}
|
||||
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{%- endmacro -%}
|
||||
@@ -1,10 +1,10 @@
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import hashlib
|
||||
from typing import IO, Any
|
||||
from typing import IO
|
||||
|
||||
from ..core import Media, Metadata, Step
|
||||
from ..core import Media, Step, ArchivingContext
|
||||
from ..enrichers import HashEnricher
|
||||
from loguru import logger
|
||||
import os, uuid
|
||||
from slugify import slugify
|
||||
@@ -41,8 +41,11 @@ class Storage(Step):
|
||||
# only for typing...
|
||||
return Step.init(name, config, Storage)
|
||||
|
||||
def store(self, media: Media, item: Metadata) -> None:
|
||||
self.set_key(media, item)
|
||||
def store(self, media: Media, url: str) -> None:
|
||||
if media.is_stored():
|
||||
logger.debug(f"{self.key} already stored, skipping")
|
||||
return
|
||||
self.set_key(media, url)
|
||||
self.upload(media)
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
||||
@@ -57,25 +60,25 @@ class Storage(Step):
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, item: Metadata) -> None:
|
||||
def set_key(self, media: Media, url) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = item.get("folder", "")
|
||||
folder = ArchivingContext.get("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# path_generator logic
|
||||
if self.path_generator == "flat":
|
||||
if self.path_generator == "flat":
|
||||
path = ""
|
||||
filename = slugify(filename) # in case it comes with os.sep
|
||||
elif self.path_generator == "url": path = slugify(item.get_url())
|
||||
filename = slugify(filename) # in case it comes with os.sep
|
||||
elif self.path_generator == "url": path = slugify(url)
|
||||
elif self.path_generator == "random":
|
||||
path = item.get("random_path", str(uuid.uuid4())[:16], True)
|
||||
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
|
||||
|
||||
# filename_generator logic
|
||||
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
||||
elif self.filename_generator == "static":
|
||||
with open(media.filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
filename = hashlib.sha256(bytes).hexdigest()[:24]
|
||||
elif self.filename_generator == "static":
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
@@ -40,11 +40,11 @@ class GWorksheet:
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.headers.index(self.columns[col])
|
||||
return self.headers.index(self.columns[col].lower())
|
||||
|
||||
def col_exists(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.columns[col] in self.headers
|
||||
return self.columns[col].lower() in self.headers
|
||||
|
||||
def count_rows(self):
|
||||
return len(self.values)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "4"
|
||||
_MINOR = "5"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "1"
|
||||
_PATCH = "7"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user