Compare commits

..

32 Commits

Author SHA1 Message Date
msramalho
0ecbed0df0 Bump version to v0.5.6 for release 2023-04-18 18:49:08 +01:00
msramalho
69bcfea2eb to_json fix 2023-04-18 18:48:51 +01:00
msramalho
2e2e695444 whisper enricher 2023-03-23 18:50:37 +00:00
msramalho
493055a8d9 cleanup 2023-03-23 18:50:30 +00:00
msramalho
6f6eb2db7a Archiving Context refactor complete 2023-03-23 14:28:45 +00:00
msramalho
906ed0f6e0 creating global context and refactoring tmp_dir logic 2023-03-23 11:17:38 +00:00
msramalho
39818e648a Bump version to v0.4.5 for release 2023-03-16 15:05:42 +00:00
Miguel Sozinho Ramalho
2bbf534d67 Merge pull request #72 from milesmcc/patch-1
Fix hash enricher for flatfile output (closes #71)
2023-03-16 15:04:55 +00:00
R. Miles McCain
6be7536fad Fix hash enricher for flatfile output (closes #71) 2023-03-14 13:37:54 -07:00
msramalho
0654e8c5c6 hash calculation in chunks to avoid exhausting RAM 2023-03-10 11:34:29 +00:00
msramalho
0e3c427371 Bump version to v0.4.3 for release 2023-02-27 10:30:06 +01:00
msramalho
7497bc08c0 Bump version to v0.4.2 for release 2023-02-23 17:14:29 +01:00
msramalho
49863768fe vk updates 2023-02-22 18:35:15 +01:00
msramalho
7b9483bbf9 yt-dlp update 2023-02-22 18:28:20 +01:00
msramalho
cd81cae559 auth wall for WACZ 2023-02-20 16:08:45 +00:00
msramalho
23894fad51 normalize columns 2023-02-20 16:08:35 +00:00
msramalho
876988b587 detect invalid url messages instagram bot 2023-02-20 12:22:52 +00:00
msramalho
f95293b84b support for multiple media instagram 2023-02-20 11:25:02 +00:00
msramalho
2fbcbe4e8b double session issues 2023-02-20 11:11:39 +00:00
msramalho
d1e4574c6c readme updates 2023-02-17 16:30:50 +00:00
msramalho
d347b26d37 updating example config 2023-02-17 16:26:23 +00:00
msramalho
1970fa3c82 new instagram archiver via telegram bot 2023-02-17 16:15:25 +00:00
msramalho
aa5430451e instagram archiver via telegram bot 2023-02-17 15:46:29 +00:00
msramalho
f35875a94c name fix 2023-02-17 15:46:05 +00:00
msramalho
5505255ea3 url auth wall detect 2023-02-17 15:45:58 +00:00
msramalho
da17b3f68a name fix 2023-02-17 15:45:35 +00:00
msramalho
d6dbdec6ac example 2023-02-09 12:32:55 +00:00
msramalho
224ebe7ee8 links 2023-02-08 22:27:56 +00:00
msramalho
54a1bc2172 update readme 2023-02-08 22:26:24 +00:00
msramalho
77948207d1 update 2023-02-08 22:24:40 +00:00
msramalho
60552ae0ea update readme 2023-02-08 22:23:25 +00:00
msramalho
f255271ecb update README 2023-02-08 22:17:22 +00:00
35 changed files with 779 additions and 294 deletions

View File

@@ -14,7 +14,6 @@ loguru = "*"
ffmpeg-python = "*" ffmpeg-python = "*"
selenium = "*" selenium = "*"
snscrape = "*" snscrape = "*"
yt-dlp = "*"
telethon = "*" telethon = "*"
google-api-python-client = "*" google-api-python-client = "*"
google-auth-httplib2 = "*" google-auth-httplib2 = "*"
@@ -23,13 +22,14 @@ oauth2client = "*"
python-slugify = "*" python-slugify = "*"
pyyaml = "*" pyyaml = "*"
dateparser = "*" dateparser = "*"
vk-url-scraper = "*"
python-twitter-v2 = "*" python-twitter-v2 = "*"
instaloader = "*" instaloader = "*"
tqdm = "*" tqdm = "*"
jinja2 = "*" jinja2 = "*"
cryptography = "==38.0.4" cryptography = "==38.0.4"
dataclasses-json = "*" dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
vk-url-scraper = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.9"

248
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4" "sha256": "7176a6666639452dbf30939fa095ff23518aee6da7d9561de0f12ba0aceed527"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -57,19 +57,19 @@
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d", "sha256:17f0d782487275cac12676a61b3f1a4900954cc454c842b8551ca47a3dcd59b4",
"sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734" "sha256:bf808f7433629650128ab577a9d4a0f4daf072d9f2f3a907b9d567a6952d9154"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.26.66" "version": "==1.26.77"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd", "sha256:9d94a02f2584b52c65fb3cb309fb1b29d6d0c36d69062722b0275c1c382c44c9",
"sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5" "sha256:d8aa7bffe2422de282b2d02945b7b45d5fecf00f67b65eebb0b1fa3de1abc6d0"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==1.29.66" "version": "==1.29.77"
}, },
"brotli": { "brotli": {
"hashes": [ "hashes": [
@@ -176,11 +176,11 @@
}, },
"certifi": { "certifi": {
"hashes": [ "hashes": [
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2022.6.15" "version": "==2022.12.7"
}, },
"cffi": { "cffi": {
"hashes": [ "hashes": [
@@ -253,11 +253,97 @@
}, },
"charset-normalizer": { "charset-normalizer": {
"hashes": [ "hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" "sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
], ],
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.6'",
"version": "==2.0.12" "version": "==3.0.1"
}, },
"click": { "click": {
"hashes": [ "hashes": [
@@ -348,11 +434,11 @@
}, },
"flask": { "flask": {
"hashes": [ "hashes": [
"sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b", "sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d",
"sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526" "sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.2.2" "version": "==2.2.3"
}, },
"future": { "future": {
"hashes": [ "hashes": [
@@ -371,19 +457,19 @@
}, },
"google-api-python-client": { "google-api-python-client": {
"hashes": [ "hashes": [
"sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483", "sha256:577c0aeae1eb3c754eacb9122d369d67609fef759bc6a4fa16cafeab4f30019b",
"sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0" "sha256:b9b6dc5f139892310093ba75d0df4c78f48655078953c923957dab1ec86129e7"
], ],
"index": "pypi", "index": "pypi",
"version": "==2.77.0" "version": "==2.79.0"
}, },
"google-auth": { "google-auth": {
"hashes": [ "hashes": [
"sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc", "sha256:5fd170986bce6bfd7bb5c845c4b8362edb1e0cba901e062196e83f8bb5d5d32c",
"sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd" "sha256:75d76ea857df65938e1f71dcbcd7d0cd48e3f80b34b8870ba229c9292081f7ef"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.16.0" "version": "==2.16.1"
}, },
"google-auth-httplib2": { "google-auth-httplib2": {
"hashes": [ "hashes": [
@@ -435,18 +521,18 @@
}, },
"idna": { "idna": {
"hashes": [ "hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
], ],
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.5'",
"version": "==3.3" "version": "==3.4"
}, },
"instaloader": { "instaloader": {
"hashes": [ "hashes": [
"sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e" "sha256:16040c170fb5230c1981a47e1990261e3c0ecffe0417be95fa265632244e7c01"
], ],
"index": "pypi", "index": "pypi",
"version": "==4.9.5" "version": "==4.9.6"
}, },
"itsdangerous": { "itsdangerous": {
"hashes": [ "hashes": [
@@ -565,11 +651,11 @@
}, },
"markdown-it-py": { "markdown-it-py": {
"hashes": [ "hashes": [
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27", "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da" "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.1.0" "version": "==2.2.0"
}, },
"markupsafe": { "markupsafe": {
"hashes": [ "hashes": [
@@ -700,23 +786,22 @@
}, },
"protobuf": { "protobuf": {
"hashes": [ "hashes": [
"sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30", "sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb",
"sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b", "sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b",
"sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc", "sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e",
"sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791", "sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930",
"sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717", "sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71",
"sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec", "sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4",
"sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7", "sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491",
"sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab", "sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6",
"sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2", "sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1",
"sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5", "sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e",
"sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1", "sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571",
"sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462", "sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e",
"sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97", "sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"
"sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==4.21.12" "version": "==4.22.0"
}, },
"pyaes": { "pyaes": {
"hashes": [ "hashes": [
@@ -838,14 +923,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2" "version": "==2.8.2"
}, },
"python-dotenv": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
"markers": "python_version >= '3.5'",
"version": "==0.20.0"
},
"python-slugify": { "python-slugify": {
"hashes": [ "hashes": [
"sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c", "sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c",
@@ -1019,11 +1096,11 @@
}, },
"requests": { "requests": {
"hashes": [ "hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"
], ],
"markers": "python_version >= '3.7' and python_version < '4'", "markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.0" "version": "==2.28.2"
}, },
"requests-oauthlib": { "requests-oauthlib": {
"hashes": [ "hashes": [
@@ -1067,11 +1144,11 @@
}, },
"selenium": { "selenium": {
"hashes": [ "hashes": [
"sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683", "sha256:bd04eb41395605d9b2b65fe587f3fed21431da75512985c52772529e5e210c60",
"sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af" "sha256:c48372905bffcc3b24bd55ab4683a07ee5e1f30fe918c59558ea5ee44cedf6c3"
], ],
"index": "pypi", "index": "pypi",
"version": "==4.8.0" "version": "==4.8.2"
}, },
"six": { "six": {
"hashes": [ "hashes": [
@@ -1106,11 +1183,11 @@
}, },
"soupsieve": { "soupsieve": {
"hashes": [ "hashes": [
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" "sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.7'",
"version": "==2.3.2.post1" "version": "==2.4"
}, },
"telethon": { "telethon": {
"hashes": [ "hashes": [
@@ -1160,11 +1237,11 @@
}, },
"typing-extensions": { "typing-extensions": {
"hashes": [ "hashes": [
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==4.4.0" "version": "==4.5.0"
}, },
"typing-inspect": { "typing-inspect": {
"hashes": [ "hashes": [
@@ -1198,27 +1275,30 @@
"version": "==4.1.1" "version": "==4.1.1"
}, },
"urllib3": { "urllib3": {
"hashes": [ "extras": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "socks"
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "hashes": [
"version": "==1.26.9" "sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==1.26.14"
}, },
"vk-api": { "vk-api": {
"hashes": [ "hashes": [
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", "sha256:c71021506449afe5b9bbb1c4acb0d86b35a007ddc21678478e46fbbeabd1f3ef",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" "sha256:c7741e40bc05980c91ed94c84542e1e7e7370e101b5eaa74222958d4130fe3c2"
], ],
"version": "==11.9.8" "version": "==11.9.9"
}, },
"vk-url-scraper": { "vk-url-scraper": {
"hashes": [ "hashes": [
"sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b", "sha256:5a32fb5419f7bb8bd35de8548948fe27a06f857a4d086c87e142bf07aabc3fd7",
"sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed" "sha256:a87c5aa7c1570c3aa87031e78c2052105e3681f57503fd4cb56470c3ab6106d6"
], ],
"index": "pypi", "index": "pypi",
"version": "==0.3.10" "version": "==0.3.15"
}, },
"websockets": { "websockets": {
"hashes": [ "hashes": [
@@ -1297,11 +1377,11 @@
}, },
"werkzeug": { "werkzeug": {
"hashes": [ "hashes": [
"sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f", "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe",
"sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5" "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.2.2" "version": "==2.2.3"
}, },
"wsproto": { "wsproto": {
"hashes": [ "hashes": [
@@ -1313,11 +1393,11 @@
}, },
"yt-dlp": { "yt-dlp": {
"hashes": [ "hashes": [
"sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02", "sha256:3b2df037c80922f0f83f63ee2f9253496b4a8668c0fe8d2a836ba9040f853b07",
"sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae" "sha256:9af92de5effc193bdb51216d9ebf28874d96180d202fae752b0d9f2a63380f3a"
], ],
"index": "pypi", "index": "pypi",
"version": "==2022.7.18" "version": "==2023.2.17"
} }
}, },
"develop": { "develop": {

View File

@@ -1,4 +1,12 @@
# Auto Archiver <h1 align="center">Auto Archiver</h1>
[![PyPI version](https://badge.fury.io/py/auto-archiver.svg)](https://badge.fury.io/py/auto-archiver)
[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/bellingcat/auto-archiver?label=version&logo=docker)](https://pypi.org/project/auto-archiver/)
<!-- ![Docker Pulls](https://img.shields.io/docker/pulls/bellingcat/auto-archiver) -->
<!-- [![PyPI download month](https://img.shields.io/pypi/dm/auto-archiver.svg)](https://pypi.python.org/pypi/auto-archiver/) -->
<!-- [![Documentation Status](https://readthedocs.org/projects/vk-url-scraper/badge/?version=latest)](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/). Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).
@@ -15,6 +23,11 @@ But **you always need a configuration/orchestration file**, which is where you'l
## How to run the auto-archiver ## How to run the auto-archiver
### Option 1 - docker ### Option 1 - docker
<details><summary><code>Docker instructions</code></summary>
[![dockeri.co](https://dockerico.blankenship.io/image/bellingcat/auto-archiver)](https://hub.docker.com/r/bellingcat/auto-archiver)
Docker works like a virtual machine running inside your computer: it isolates everything and makes installation simple. Since it is an isolated environment, when you need to pass it your orchestration file or get downloaded media out of docker, you will need to connect folders on your machine with folders inside docker with the `-v` volume flag. Docker works like a virtual machine running inside your computer: it isolates everything and makes installation simple. Since it is an isolated environment, when you need to pass it your orchestration file or get downloaded media out of docker, you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
@@ -32,14 +45,20 @@ Docker works like a virtual machine running inside your computer, it isolates ev
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker 2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file 3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
</details>
### Option 2 - python package ### Option 2 - python package
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.8 or higher installed 1. make sure you have python 3.8 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver` 2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help` 3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` 4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml`
1. if your orchestration file is inside a `secrets/` which we advise 1. if your orchestration file is inside a `secrets/` which we advise
</details>
### Option 3 - local installation ### Option 3 - local installation
This can also be used for development. This can also be used for development.
@@ -60,13 +79,6 @@ Clone and run:
</details><br/> </details><br/>
### Examples
# Orchestration # Orchestration
The archiver work is orchestrated by the following workflow (we call each a **step**): The archiver work is orchestrated by the following workflow (we call each a **step**):
1. **Feeder** gets the links (from a spreadsheet, from the console, ...) 1. **Feeder** gets the links (from a spreadsheet, from the console, ...)
@@ -85,7 +97,7 @@ The structure of orchestration file is split into 2 parts: `steps` (what **steps
steps: steps:
feeder: gsheet_feeder feeder: gsheet_feeder
archivers: # order matters archivers: # order matters
- youtubedl_enricher - youtubedl_archiver
enrichers: enrichers:
- thumbnail_enricher - thumbnail_enricher
formatter: html_formatter formatter: html_formatter
@@ -141,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
# all the configurations come from ./orchestration.yaml # all the configurations come from ./orchestration.yaml
auto-archiver auto-archiver
# all the configurations come from ./secrets/orchestration.yaml # all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config orchestration.yaml auto-archiver --config secrets/orchestration.yaml
# uses the configurations but for another google docs sheet # uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names # with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided # notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}' auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private # all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --s3_storage.private=1 auto-archiver --s3_storage.private=1
``` ```
@@ -154,11 +166,11 @@ auto-archiver --s3_storage.private=1
#### Google Drive #### Google Drive
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
#### Telethon (Telegrams API Library) #### Telethon + Instagram with telegram bot
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root. The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root.
## Running on Google Sheets Feeder (gsheets_feeder) ## Running on Google Sheets Feeder (gsheet_feeder)
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`. This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names. This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
@@ -171,23 +183,25 @@ When the auto archiver starts running, it updates the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png) ![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following: The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png) ![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
--- ---
## Development ## Development
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment. Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.
# Docker development #### Docker development
* working with docker locally: working with docker locally:
* `docker build . -t auto-archiver` to build a local image * `docker build . -t auto-archiver` to build a local image
* `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml` * `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive` * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
* release to docker hub
release to docker hub
* `docker image tag auto-archiver bellingcat/auto-archiver:latest` * `docker image tag auto-archiver bellingcat/auto-archiver:latest`
* `docker push bellingcat/auto-archiver` * `docker push bellingcat/auto-archiver`
# RELEASE #### RELEASE
* update version in [version.py](src/auto_archiver/version.py) * update version in [version.py](src/auto_archiver/version.py)
* run `bash ./scripts/release.sh` and confirm * run `bash ./scripts/release.sh` and confirm
* package is automatically updated in pypi * package is automatically updated in pypi

View File

@@ -1,80 +1,123 @@
steps: steps:
# only 1 feeder allowed # only 1 feeder allowed
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary feeder: gsheet_feeder # defaults to cli_feeder
feeder: gsheet_feeder # default -> only expects URL from CLI archivers: # order matters, uncomment to activate
archivers: # order matters # - vk_archiver
- telethon # - telethon_archiver
# - tiktok # - telegram_archiver
# - twitter # - twitter_archiver
# - instagram # - twitter_api_archiver
# - webarchive # this way it runs as a failsafe only # - instagram_tbot_archiver
# enrichers: # - instagram_archiver
# - screenshot # - tiktok_archiver
# - wacz - youtubedl_archiver
# - webarchive # this way it runs for every case, webarchive extends archiver and enrichment - wayback_archiver_enricher
# - thumbnails enrichers:
formatters: - hash_enricher
- HTMLFormater # - screenshot_enricher
- PdfFormater # - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_enricher
formatter: html_formatter # defaults to mute_formatter
storages: storages:
- local_storage - local_storage
- s3 # - s3_storage
# - gdrive_storage
databases: databases:
- gsheets_db - console_db
- mongo_db # - csv_db
# - gsheet_db
# - mongo_db
configurations: configurations:
gsheet_feeder: gsheet_feeder:
sheet: my-auto-archiver sheet: "your sheet name"
header: 2 # defaults to 1 in GSheetsFeeder header: 1
service_account: "secrets/service_account.json" service_account: "secrets/service_account.json"
# allow_worksheets: "allowed" # allow_worksheets: "only parse this worksheet"
# block_worksheets: "blocked1,blocked2" # block_worksheets: "blocked sheet 1,blocked sheet 2"
use_sheet_names_in_stored_paths: false
columns: columns:
'url': 'link' url: link
'status': 'archive status' status: archive status
'folder': 'destination folder' folder: destination folder
'archive': 'archive location' archive: archive location
'date': 'archive date' date: archive date
'thumbnail': 'thumbnail' thumbnail: thumbnail
'thumbnail_index': 'thumbnail index' thumbnail_index: thumbnail index
'timestamp': 'upload timestamp' timestamp: upload timestamp
'title': 'upload title' title: upload title
'duration': 'duration' text: textual content
'screenshot': 'screenshot' duration: duration
'hash': 'hash' screenshot: screenshot
'wacz': 'wacz' hash: hash
'replaywebpage': 'replaywebpage' wacz: wacz
telethon: replaywebpage: replaywebpage
api_id: "1234567" instagram_tbot_archiver:
api_hash: "examplehash" api_id: "TELEGRAM_BOT_API_ID"
session_file: "secrets/anon" api_hash: "TELEGRAM_BOT_API_HASH"
channel_invites: # session_file: "secrets/anon"
- invite: https://t.me/+XXXXXXXXXXXXXX telethon_archiver:
id: 1000000000 api_id: "TELEGRAM_BOT_API_ID"
- invite: https://t.me/joinchat/XXXXXXXXXXXXXX api_hash: "TELEGRAM_BOT_API_HASH"
id: 1000000001 # session_file: "secrets/anon"
join_channels: false
channel_invites: # if you want to archive from private channels
- invite: https://t.me/+123456789
id: 0000000001
- invite: https://t.me/+123456788
id: 0000000002
tiktok: twitter_api_archiver:
api_keys: # either bearer_token only
- username: 1 bearer_token: "TWITTER_BEARER_TOKEN"
password: 2 # OR all of the below
- username: 3 # consumer_key: ""
password: 4 # consumer_secret: ""
username: "abc" # access_token: ""
password: "123" # access_secret: ""
token: "here" instagram_archiver:
screenshot: username: "INSTAGRAM_USERNAME"
password: "INSTAGRAM_PASSWORD"
# session_file: "secrets/instaloader.session"
vk_archiver:
username: "or phone number"
password: "vk pass"
session_file: "secrets/vk_config.v2.json"
screenshot_enricher:
width: 1280 width: 1280
height: 4600 height: 2300
wacz: wayback_archiver_enricher:
timeout: 10
key: "wayback key"
secret: "wayback secret"
hash_enricher:
algorithm: "SHA3-512" # can also be SHA-256
wacz_enricher:
profile: secrets/profile.tar.gz profile: secrets/profile.tar.gz
webarchive: local_storage:
api_key: "12345" save_to: "./local_archive"
s3: save_absolute: true
- bucket: 123 filename_generator: static
- region: "nyc3" path_generator: flat
- cdn: "{region}{bucket}" s3_storage:
bucket: your-bucket-name
region: reg1
key: S3_KEY
secret: S3_SECRET
endpoint_url: "https://{region}.digitaloceanspaces.com"
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
private: false
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files; the alternative is 'default' or omitting it from the config
key_path: random
gdrive_storage:
path_generator: url
filename_generator: random
root_folder_id: folder_id_from_url
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
service_account: "secrets/service_account.json"

View File

@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver from .vk_archiver import VkArchiver

View File

@@ -3,8 +3,8 @@ from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
import os import os
import mimetypes, requests import mimetypes, requests
from ..core import Metadata
from ..core import Step from ..core import Metadata, Step, ArchivingContext
@dataclass @dataclass
@@ -51,7 +51,7 @@ class Archiver(Step):
if len(to_filename) > 64: if len(to_filename) > 64:
to_filename = to_filename[-64:] to_filename = to_filename[-64:]
if item: if item:
to_filename = os.path.join(item.get_tmp_dir(), to_filename) to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
} }

View File

@@ -0,0 +1,77 @@
from telethon.sync import TelegramClient
from loguru import logger
import time, os
from sqlite3 import OperationalError
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
class InstagramTbotArchiver(Archiver):
    """
    calls a telegram bot to fetch instagram posts/stories... and gets available media from it
    https://github.com/adw0rd/instagrapi
    https://t.me/instagram_load_bot
    """
    name = "instagram_tbot_archiver"

    def __init__(self, config: dict) -> None:
        """
        Reads the step configuration and creates the Telegram client.

        `api_id` and `api_hash` are checked with `assert_valid_string`
        (presumably rejecting missing/empty values - defined on the parent class).
        """
        super().__init__(config)
        self.assert_valid_string("api_id")
        self.assert_valid_string("api_hash")
        # config values may arrive as strings (eg: from the CLI), the polling loop needs an int
        self.timeout = int(self.timeout)
        try:
            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
        except OperationalError as e:
            # NOTE(review): self.client stays unset on this path, so setup()/download()
            # will fail afterwards - confirm whether this should abort instead
            logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")

    @staticmethod
    def configs() -> dict:
        """Returns the configurable options of this archiver with their defaults and help texts."""
        return {
            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
            "timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
        }

    def setup(self) -> None:
        """Opens (and closes) a telegram session once so login problems surface before archiving starts."""
        logger.info(f"SETUP {self.name} checking login...")
        with self.client.start():
            logger.success(f"SETUP {self.name} login works.")

    def download(self, item: Metadata) -> Metadata:
        """
        Sends the item's URL to the instagram_load_bot and collects the bot's replies.

        Returns a successful Metadata holding the downloaded media and, when the bot
        sent text, that text as content and (first 128 chars) as title.
        Returns False when:
          - the URL is not an instagram.com URL
          - the bot reports the link as invalid
          - the bot sent neither media nor text before the timeout
        """
        url = item.get_url()
        if "instagram.com" not in url: return False

        result = Metadata()
        tmp_dir = ArchivingContext.get_tmp_dir()
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            # only replies newer than our own message are of interest
            since_id = self.client.send_message(entity=chat, message=url).id

            attempts = 0
            seen_media = []
            message = ""
            # initial grace period before polling for replies
            time.sleep(4)
            # media is added before text by the bot so it can be used as a stop-logic mechanism
            # polls once per second, at most `timeout` times, until both media and text arrived
            while attempts < self.timeout and (not message or not seen_media):
                attempts += 1
                time.sleep(1)
                for post in self.client.iter_messages(chat, min_id=since_id):
                    # advance the cursor so the next poll only returns newer replies
                    since_id = max(since_id, post.id)
                    if post.media and post.id not in seen_media:
                        filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
                        media = self.client.download_media(post.media, filename_dest)
                        if media:
                            result.add_media(Media(media))
                            seen_media.append(post.id)
                    if post.message: message += post.message

            if "You must enter a URL to a post" in message:
                logger.debug(f"invalid link {url=} for {self.name}: {message}")
                return False

            # the bot never answered: do not report an empty result as a success
            # (presumably a False return lets other archivers try - verify against the orchestrator)
            if not message and not seen_media:
                logger.warning(f"no reply received for {url=} in {self.name} within the timeout")
                return False

            if message:
                result.set_content(message).set_title(message[:128])

            return result.success("insta-via-bot")

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
import re, time, json, os import re, time, json, os
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media, ArchivingContext
class TelethonArchiver(Archiver): class TelethonArchiver(Archiver):
@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
with self.client.start(): with self.client.start():
# with self.client.start(bot_token=self.bot_token): # with self.client.start(bot_token=self.bot_token):
try: try:
post = self.client.get_messages(chat, ids=post_id) post = self.client.get_messages(chat, ids=post_id)
except ValueError as e: except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False return False
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
media_posts = self._get_media_posts_in_group(chat, post) media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}') logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = item.get_tmp_dir() tmp_dir = ArchivingContext.get_tmp_dir()
group_id = post.grouped_id if post.grouped_id is not None else post.id group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message title = post.message

View File

@@ -3,7 +3,7 @@ import tiktok_downloader
from loguru import logger from loguru import logger
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media, ArchivingContext
class TiktokArchiver(Archiver): class TiktokArchiver(Archiver):
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}') logger.warning(f'Other Tiktok error {error}')
try: try:
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media() tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0: if len(tiktok_media) <= 0:

View File

@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
return self.link_clean_pattern.sub("\\1", url) return self.link_clean_pattern.sub("\\1", url)
def is_rearchivable(self, url: str) -> bool: def is_rearchivable(self, url: str) -> bool:
# Twitter posts are static # Twitter posts are static (for now)
return False return False
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item) media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media) result.add_media(media)
return result.success("twitter") return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
""" """

View File

@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload from ..utils.misc import dump_payload
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver): class VkArchiver(Archiver):
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
result.set_content(dump_payload(vk_scrapes)) result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir()) filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
for filename in filenames: for filename in filenames:
result.add_media(Media(filename)) result.add_media(Media(filename))

View File

@@ -2,11 +2,11 @@ import datetime, os, yt_dlp
from loguru import logger from loguru import logger
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media, ArchivingContext
class YoutubeDLArchiver(Archiver): class YoutubeDLArchiver(Archiver):
name = "youtubedl_enricher" name = "youtubedl_archiver"
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
super().__init__(config) super().__init__(config)
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie') logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
try: try:
# don'd download since it can be a live stream # don'd download since it can be a live stream

View File

@@ -1,6 +1,7 @@
from .media import Media
from .metadata import Metadata from .metadata import Metadata
from .media import Media
from .step import Step from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep # cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator # from .orchestrator import ArchivingOrchestrator

View File

@@ -0,0 +1,53 @@
from loguru import logger
class ArchivingContext:
    """
    Singleton context class.
    ArchivingContext.get_instance() to retrieve it if needed
    otherwise just
        ArchivingContext.set(key, value)
    and
        ArchivingContext.get(key, default)

    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
        reset(full_reset=True) will recreate everything including the keep_on_reset status
    """
    _instance = None

    def __init__(self):
        # key -> value store shared by the whole process
        self.configs = {}
        # keys that survive a plain reset()
        self.keep_on_reset = set()

    @staticmethod
    def get_instance():
        """Returns the single shared ArchivingContext, creating it on first use."""
        if ArchivingContext._instance is None:
            ArchivingContext._instance = ArchivingContext()
        return ArchivingContext._instance

    @staticmethod
    def set(key, value, keep_on_reset: bool = False):
        """
        Stores value under key; with keep_on_reset=True the key survives reset().

        Deliberately silent: this is called for every context write with
        arbitrary values, so it must not log each one at ERROR level.
        """
        ac = ArchivingContext.get_instance()
        ac.configs[key] = value
        if keep_on_reset: ac.keep_on_reset.add(key)

    @staticmethod
    def get(key: str, default=None):
        """Returns the value stored under key, or default if the key is not set."""
        return ArchivingContext.get_instance().configs.get(key, default)

    @staticmethod
    def reset(full_reset: bool = False):
        """
        Clears all values except those that were set with keep_on_reset=True.
        With full_reset=True the keep_on_reset markers are dropped first, so everything is cleared.
        """
        ac = ArchivingContext.get_instance()
        if full_reset: ac.keep_on_reset = set()
        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}

    # ---- custom getters/setters for widely used context values

    @staticmethod
    def set_tmp_dir(tmp_dir: str):
        """Stores the temporary working directory under the "tmp_dir" key (cleared on reset)."""
        ArchivingContext.set("tmp_dir", tmp_dir)

    @staticmethod
    def get_tmp_dir() -> str:
        """Returns the temporary working directory, or None if it was not set."""
        return ArchivingContext.get("tmp_dir")

View File

@@ -3,18 +3,46 @@ from __future__ import annotations
from ast import List from ast import List
from typing import Any from typing import Any
from dataclasses import dataclass, field from dataclasses import dataclass, field
from dataclasses_json import dataclass_json from dataclasses_json import dataclass_json, config
import mimetypes import mimetypes
# annotation order matters from .context import ArchivingContext
@dataclass_json
from loguru import logger
@dataclass_json # annotation order matters
@dataclass @dataclass
class Media: class Media:
filename: str filename: str
key: str = None key: str = None
urls: List[str] = field(default_factory=list) urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
properties: dict = field(default_factory=dict) properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
    """
    Stores the media into the provided/available storages [Storage].

    Repeats the process for its properties, in case they have inner media themselves.
    For now it only goes down 1 level but it's easy to make it recursive if needed.

    override_storages: storages to use instead of the ones kept in ArchivingContext
    url: passed along to each storage's store() call
    """
    storages = override_storages or ArchivingContext.get("storages")
    # ArchivingContext.get returns None when "storages" was never set, so test
    # truthiness (covers both None and an empty list) instead of calling len()
    if not storages:
        logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
        return

    for s in storages:
        s.store(self, url)
        # Media can be inside media properties, examples include transformations on original media
        for prop in self.properties.values():
            if isinstance(prop, Media):
                s.store(prop, url)
            if isinstance(prop, list):
                for prop_media in prop:
                    if isinstance(prop_media, Media):
                        s.store(prop_media, url)
def is_stored(self) -> bool:
    """Tells whether this media has at least one entry in its urls list."""
    return bool(len(self.urls))
def set(self, key: str, value: Any) -> Media: def set(self, key: str, value: Any) -> Media:
self.properties[key] = value self.properties[key] = value
@@ -40,3 +68,6 @@ class Media:
def is_video(self) -> bool: def is_video(self) -> bool:
return self.mimetype.startswith("video") return self.mimetype.startswith("video")
def is_audio(self) -> bool:
    """Tells whether the media's mimetype begins with "audio"."""
    mime = self.mimetype
    return mime.startswith("audio")

View File

@@ -3,24 +3,25 @@ from __future__ import annotations
from ast import List, Set from ast import List, Set
from typing import Any, Union, Dict from typing import Any, Union, Dict
from dataclasses import dataclass, field from dataclasses import dataclass, field
from dataclasses_json import dataclass_json from dataclasses_json import dataclass_json, config
import datetime import datetime
from urllib.parse import urlparse from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
from .media import Media from .media import Media
from .context import ArchivingContext
# annotation order matters @dataclass_json # annotation order matters
@dataclass_json
@dataclass @dataclass
class Metadata: class Metadata:
status: str = "no archiver" status: str = "no archiver"
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict) metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list) media: List[Media] = field(default_factory=list)
rearchivable: bool = True # defaults to true, archivers can overwrite rearchivable: bool = True # defaults to true, archivers can overwrite
def __post_init__(self):
    """Records the current UTC time under the "_processed_at" metadata key."""
    processed_at = datetime.datetime.utcnow()
    self.set("_processed_at", processed_at)
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
""" """
merges two Metadata instances, will overwrite according to overwrite_left flag merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -30,7 +31,6 @@ class Metadata:
if right.status and len(right.status): if right.status and len(right.status):
self.status = right.status self.status = right.status
self.rearchivable |= right.rearchivable self.rearchivable |= right.rearchivable
self.tmp_keys |= right.tmp_keys
for k, v in right.metadata.items(): for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k)) assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata: if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -43,10 +43,14 @@ class Metadata:
return right.merge(self) return right.merge(self)
return self return self
def set(self, key: str, val: Any, is_tmp=False) -> Metadata: def store(self: Metadata, override_storages: List = None):
# if not self.metadata: self.metadata = {} # calls .store for all contained media. storages [Storage]
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages)
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val self.metadata[key] = val
if is_tmp: self.tmp_keys.add(key)
return self return self
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]: def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -63,6 +67,9 @@ class Metadata:
def is_success(self) -> bool: def is_success(self) -> bool:
return "success" in self.status return "success" in self.status
def is_empty(self) -> bool:
    """
    Tells whether nothing was archived: not a success, no media,
    and at most two metadata entries.
    """
    if self.is_success():
        return False
    if len(self.media) != 0:
        return False
    # the two tolerated entries are url and processed_at
    return len(self.metadata) <= 2
@property # getter .netloc @property # getter .netloc
def netloc(self) -> str: def netloc(self) -> str:
return urlparse(self.get_url()).netloc return urlparse(self.get_url()).netloc
@@ -90,12 +97,6 @@ class Metadata:
def get_title(self) -> str: def get_title(self) -> str:
return self.get("title") return self.get("title")
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
return self.set("tmp_dir", tmp_dir, True)
def get_tmp_dir(self) -> str:
return self.get("tmp_dir")
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata: def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
if type(timestamp) == str: if type(timestamp) == str:
timestamp = parse_dt(timestamp) timestamp = parse_dt(timestamp)
@@ -122,7 +123,7 @@ class Metadata:
for m in self.media: for m in self.media:
if m.get("id") == id: return m if m.get("id") == id: return m
return default return default
def get_first_image(self, default=None) -> Media: def get_first_image(self, default=None) -> Media:
for m in self.media: for m in self.media:
if "image" in m.mimetype: return m if "image" in m.mimetype: return m
@@ -136,8 +137,5 @@ class Metadata:
_default = self.media[0] if len(self.media) else None _default = self.media[0] if len(self.media) else None
return self.get_media_by_id("_final_media", _default) return self.get_media_by_id("_final_media", _default)
def get_clean_metadata(self) -> Metadata: def __str__(self) -> str:
return dict( return self.__repr__()
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
**{"processed_at": self._processed_at}
)

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
from ast import List from ast import List
from typing import Union from typing import Union
from .context import ArchivingContext
from ..archivers import Archiver from ..archivers import Archiver
from ..feeders import Feeder from ..feeders import Feeder
from ..formatters import Formatter from ..formatters import Formatter
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
self.archivers: List[Archiver] = config.archivers self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages self.storages: List[Storage] = config.storages
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
for a in self.archivers: a.setup() for a in self.archivers: a.setup()
@@ -32,8 +35,9 @@ class ArchivingOrchestrator:
def feed_item(self, item: Metadata) -> Metadata: def feed_item(self, item: Metadata) -> Metadata:
try: try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir: with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir) ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item) return self.archive(item)
except KeyboardInterrupt: except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit # catches keyboard interruptions to do a clean exit
@@ -105,24 +109,17 @@ class ArchivingOrchestrator:
# 5 - store media # 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values) # looks for Media in result.media and also result.media[x].properties (as list or dict values)
for s in self.storages: result.store()
for m in result.media:
s.store(m, result) # modifies media
# Media can be inside media properties, examples include transformations on original media
for prop in m.properties.values():
if isinstance(prop, Media):
s.store(prop, result)
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
for prop_media in prop:
s.store(prop_media, result)
# 6 - format and store formatted if needed # 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc # enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):
for s in self.storages: final_media.store()
s.store(final_media, result)
result.set_final_media(final_media) result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases (DBs, Google Sheets, CSV, ...) # signal completion to databases (DBs, Google Sheets, CSV, ...)
for d in self.databases: d.done(result) for d in self.databases: d.done(result)

View File

@@ -2,13 +2,10 @@ from typing import Union, Tuple
import datetime import datetime
from urllib.parse import quote from urllib.parse import quote
# from metadata import Metadata
from loguru import logger from loguru import logger
# from . import Enricher
from . import Database from . import Database
from ..core import Metadata from ..core import Metadata, Media, ArchivingContext
from ..core import Media
from ..utils import GWorksheet from ..utils import GWorksheet
@@ -61,13 +58,13 @@ class GsheetsDb(Database):
cell_updates.append((row, 'status', item.status)) cell_updates.append((row, 'status', item.status))
media: Media = item.get_final_media() media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")): if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls)) batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")): if (thumbnail := item.get_first_image("thumbnail")):
@@ -88,7 +85,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}") logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = item.get("gsheet").get("worksheet") gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
row: int = item.get("gsheet").get("row") row: int = ArchivingContext.get("gsheet").get("row")
return gw, row return gw, row

View File

@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackArchiverEnricher from .wayback_enricher import WaybackArchiverEnricher
from .hash_enricher import HashEnricher from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczEnricher from .wacz_enricher import WaczEnricher
from .whisper_enricher import WhisperEnricher

View File

@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
super().__init__(config) super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"] algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]} "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
with open(m.filename, "rb") as f: if len(hd := self.calculate_hash(m.filename)):
bytes = f.read() # read entire file as bytes to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
hash = None
if self.algorithm == "SHA-256": def calculate_hash(self, filename):
hash = hashlib.sha256(bytes) hash = None
elif self.algorithm == "SHA3-512": if self.algorithm == "SHA-256":
hash = hashlib.sha3_512(bytes) hash = hashlib.sha256()
else: continue elif self.algorithm == "SHA3-512":
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}") hash = hashlib.sha3_512()
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()

View File

@@ -3,8 +3,8 @@ import time, uuid, os
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from . import Enricher from . import Enricher
from ..utils import Webdriver from ..utils import Webdriver, UrlUtil
from ..core import Media, Metadata from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher): class ScreenshotEnricher(Enricher):
name = "screenshot_enricher" name = "screenshot_enricher"
@@ -14,21 +14,25 @@ class ScreenshotEnricher(Enricher):
return { return {
"width": {"default": 1280, "help": "width of the screenshots"}, "width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"}, "height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"} "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"Enriching screenshot for {url=}") logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
try: try:
driver.get(url) driver.get(url)
time.sleep(2) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException: except TimeoutException:
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
except Exception as e: except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
# return None

View File

@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
from loguru import logger from loguru import logger
from . import Enricher from . import Enricher
from ..core import Media, Metadata from ..core import Media, Metadata, ArchivingContext
class ThumbnailEnricher(Enricher): class ThumbnailEnricher(Enricher):
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails") logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]): for i, m in enumerate(to_enrich.media[::]):
if m.is_video(): if m.is_video():
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}") logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration") fps, duration = 0.5, m.get("duration")

View File

@@ -1,8 +1,9 @@
import os, shutil, subprocess, uuid import os, shutil, subprocess, uuid
from loguru import logger from loguru import logger
from ..core import Media, Metadata from ..core import Media, Metadata, ArchivingContext
from . import Enricher from . import Enricher
from ..utils import UrlUtil
class WaczEnricher(Enricher): class WaczEnricher(Enricher):
@@ -20,14 +21,20 @@ class WaczEnricher(Enricher):
return { return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
} }
def enrich(self, to_enrich: Metadata) -> bool: def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker # TODO: figure out support for browsertrix in docker
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"generating WACZ for {url=}") logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8] collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir()) browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
cmd = [ cmd = [
"docker", "run", "docker", "run",
"--rm", # delete container once it has completed running "--rm", # delete container once it has completed running

View File

@@ -1,8 +1,10 @@
from loguru import logger from loguru import logger
import time, requests import time, requests
from . import Enricher from . import Enricher
from ..archivers import Archiver from ..archivers import Archiver
from ..utils import UrlUtil
from ..core import Metadata from ..core import Metadata
class WaybackArchiverEnricher(Enricher, Archiver): class WaybackArchiverEnricher(Enricher, Archiver):
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
def enrich(self, to_enrich: Metadata) -> bool: def enrich(self, to_enrich: Metadata) -> bool:
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
return
logger.debug(f"calling wayback for {url=}") logger.debug(f"calling wayback for {url=}")
if to_enrich.get("wayback"): if to_enrich.get("wayback"):

View File

@@ -0,0 +1,123 @@
import traceback
import requests, time
from loguru import logger
from . import Enricher
from ..core import Metadata, Media, ArchivingContext
from ..storages import S3Storage
class WhisperEnricher(Enricher):
    """
    Connects with a Whisper API service to get texts out of audio
    whisper API repository: TODO
    Only works if an S3 compatible storage is used
    """
    name = "whisper_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        assert isinstance(self.api_key, str) and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
        self.timeout = int(self.timeout)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help texts) of this enricher."""
        return {
            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Submit one whisper job per audio/video media item, wait for the jobs to
        complete (up to `self.timeout` seconds) and store the outcome under the
        "whisper_model" key of each media item that was successfully submitted.
        """
        if not self._get_s3_storage():
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return

        url = to_enrich.get_url()
        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")

        job_results = {}
        # media index -> job_id, only for submissions that succeeded; media whose
        # submission failed must be skipped when results are written back
        submitted = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
                m.store()
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
                    submitted[i] = job_id
                    logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
                    to_enrich.media[i].set("whisper_model", {"job_id": job_id})
                except Exception as e:
                    logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")

        job_results = self.check_jobs(job_results)

        for i, job_id in submitted.items():
            to_enrich.media[i].set("whisper_model", {
                "job_id": job_id,
                self.action: job_results[job_id]
            })

    def submit_job(self, media: Media):
        """
        Create a whisper job for an already stored media item and return its id.

        Raises AssertionError if the media has no S3 url or if the API does not
        answer with status 201.
        """
        s3 = self._get_s3_storage()
        s3_url = s3.get_cdn_url(media)
        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
        payload = {
            "url": s3_url,
            "type": self.action,
            # "language": "string" # may be a config
        }
        # explicit timeout: without it a stalled connection blocks forever
        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'}, timeout=self.timeout)
        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
        logger.debug(response.json())
        return response.json()['id']

    def check_jobs(self, job_results: dict):
        """
        Poll all unfinished jobs every 3 seconds until each one has a result or
        `self.timeout` seconds have elapsed. Returns the same dict, where
        unfinished jobs keep the value False.
        """
        start_time = time.time()
        all_completed = False
        while not all_completed and (time.time() - start_time) <= self.timeout:
            all_completed = True
            for job_id in job_results:
                # False is the only "not ready" marker: a finished job may
                # legitimately carry an empty (falsy) result list
                if job_results[job_id] is not False: continue
                all_completed = False  # at least one not ready
                try: job_results[job_id] = self.check_job(job_id)
                except Exception as e:
                    logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
            if not all_completed: time.sleep(3)
        return job_results

    def check_job(self, job_id):
        """
        Query the status of a single job.

        Returns False while the job is not finished (or has an unknown status),
        an "Error: ..." string if the job failed, or a list of
        {"subtitle", "full_text"} dicts (one per non-empty artifact) on success.
        """
        auth = {'Authorization': f'Bearer {self.api_key}'}
        r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers=auth, timeout=self.timeout)
        assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
        j = r.json()
        logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
        if j['status'] == "processing": return False
        elif j['status'] == "error": return f"Error: {j['meta']['error']}"
        elif j['status'] == "success":
            r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers=auth, timeout=self.timeout)
            assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
            artifacts = r_res.json()
            logger.success(artifacts)
            result = []
            for artifact in artifacts:
                subtitle = []
                full_text = []
                # tolerate artifacts without "data" and segments without "text"
                for i, d in enumerate(artifact.get("data") or []):
                    text = (d.get('text') or "").strip()
                    subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{text}")
                    full_text.append(text)
                if not len(subtitle): continue
                result.append({
                    "subtitle": "\n".join(subtitle),
                    "full_text": "\n".join(full_text),
                })
            return result
        return False

    def _get_s3_storage(self) -> S3Storage:
        """Return the first S3Storage instance among the configured storages, or None (with a warning) if there is none."""
        try:
            storages = ArchivingContext.get("storages") or []
            s3 = next((s for s in storages if s.__class__ == S3Storage), None)
        except Exception:
            # best-effort lookup: any failure is treated as "no S3 storage available"
            s3 = None
        if s3 is None:
            logger.warning("No S3Storage instance found in storages")
        return s3

View File

@@ -1,7 +1,7 @@
from loguru import logger from loguru import logger
from . import Feeder from . import Feeder
from ..core import Metadata from ..core import Metadata, ArchivingContext
class CLIFeeder(Feeder): class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata: def __iter__(self) -> Metadata:
for url in self.urls: for url in self.urls:
logger.debug(f"Processing {url}") logger.debug(f"Processing {url}")
yield Metadata().set_url(url).set("folder", "cli", True) yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)") logger.success(f"Processed {len(self.urls)} URL(s)")

View File

@@ -5,9 +5,10 @@ from slugify import slugify
# from . import Enricher # from . import Enricher
from . import Feeder from . import Feeder
from ..core import Metadata from ..core import Metadata, ArchivingContext
from ..utils import Gsheets, GWorksheet from ..utils import Gsheets, GWorksheet
class GsheetsFeeder(Gsheets, Feeder): class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder" name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
"help": "(CSV) explicitly block some worksheets from being processed", "help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
}, },
"use_sheet_names_in_stored_paths":{ "use_sheet_names_in_stored_paths": {
"default": True, "default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
} }
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue if status not in ['', None]: continue
# All checks done - archival process starts here # All checks done - archival process starts here
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True) m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if self.use_sheet_names_in_stored_paths: if self.use_sheet_names_in_stored_paths:
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True) ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
yield m yield m
logger.success(f'Finished worksheet {wks.title}') logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool: def should_process_sheet(self, sheet_name: str) -> bool:

View File

@@ -3,9 +3,10 @@ from dataclasses import dataclass
import mimetypes, uuid, os, pathlib import mimetypes, uuid, os, pathlib
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote from urllib.parse import quote
from loguru import logger
from ..version import __version__ from ..version import __version__
from ..core import Metadata, Media from ..core import Metadata, Media, ArchivingContext
from . import Formatter from . import Formatter
@@ -26,18 +27,23 @@ class HtmlFormatter(Formatter):
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}, "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
} }
def format(self, item: Metadata) -> Media: def format(self, item: Metadata) -> Media:
url = item.get_url()
if item.is_empty():
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
return
content = self.template.render( content = self.template.render(
url=item.get_url(), url=url,
title=item.get_title(), title=item.get_title(),
media=item.media, media=item.media,
metadata=item.get_clean_metadata(), metadata=item.metadata,
version=__version__ version=__version__
) )
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
with open(html_path, mode="w", encoding="utf-8") as outf: with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content) outf.write(content)
return Media(filename=html_path) return Media(filename=html_path)

View File

@@ -29,7 +29,7 @@
margin: auto; margin: auto;
border: 1px solid; border: 1px solid;
border-collapse: collapse; border-collapse: collapse;
vertical-align:top; vertical-align: top;
} }
table.metadata td:first-child { table.metadata td:first-child {
@@ -185,7 +185,11 @@
el.addEventListener("copy", (e) => { el.addEventListener("copy", (e) => {
e.preventDefault(); e.preventDefault();
if (e.clipboardData) { if (e.clipboardData) {
e.clipboardData.setData("text/plain", el.textContent); if (el.hasAttribute("copy-value")) {
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
} else {
e.clipboardData.setData("text/plain", el.textContent);
}
console.log(e.clipboardData.getData("text")) console.log(e.clipboardData.getData("text"))
showNotification("copied!") showNotification("copied!")
} }

View File

@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
{% endif %} {% endif %}
{% if links %} {% if links %}
<a href="{{ url }}">open</a> or <a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a> <a href="{{ url }}" download="">download</a> or
{{ copy_urlize(url, "copy") }}
<br> <br>
{% endif %} {% endif %}
{% endfor %} {% endfor %}
{%- endmacro -%} {%- endmacro -%}
{% macro copy_urlize(val) -%} {% macro copy_urlize(val, href_text) -%}
{% if val is mapping %} {% if val is mapping %}
<ul> <ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
</ul> </ul>
{% else %} {% else %}
{% if href_text | length == 0 %}
<span class="copy">{{ val | string | urlize }}</span> <span class="copy">{{ val | string | urlize }}</span>
{% else %}
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
{% endif %}
{% endif %} {% endif %}
{%- endmacro -%} {%- endmacro -%}

View File

@@ -1,10 +1,10 @@
from __future__ import annotations from __future__ import annotations
from abc import abstractmethod from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
import hashlib from typing import IO
from typing import IO, Any
from ..core import Media, Metadata, Step from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
import os, uuid import os, uuid
from slugify import slugify from slugify import slugify
@@ -41,8 +41,11 @@ class Storage(Step):
# only for typing... # only for typing...
return Step.init(name, config, Storage) return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None: def store(self, media: Media, url: str) -> None:
self.set_key(media, item) if media.is_stored():
logger.debug(f"{self.key} already stored, skipping")
return
self.set_key(media, url)
self.upload(media) self.upload(media)
media.add_url(self.get_cdn_url(media)) media.add_url(self.get_cdn_url(media))
@@ -57,25 +60,25 @@ class Storage(Step):
with open(media.filename, 'rb') as f: with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs) return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> None: def set_key(self, media: Media, url) -> None:
"""takes the media and optionally item info and generates a key""" """takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "") folder = ArchivingContext.get("folder", "")
filename, ext = os.path.splitext(media.filename) filename, ext = os.path.splitext(media.filename)
# path_generator logic # path_generator logic
if self.path_generator == "flat": if self.path_generator == "flat":
path = "" path = ""
filename = slugify(filename) # in case it comes with os.sep filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url()) elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random": elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True) path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic # filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static": elif self.filename_generator == "static":
with open(media.filename, "rb") as f: he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
bytes = f.read() # read entire file as bytes hd = he.calculate_hash(media.filename)
filename = hashlib.sha256(bytes).hexdigest()[:24] filename = hd[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}") media.key = os.path.join(folder, path, f"{filename}{ext}")

View File

@@ -2,4 +2,5 @@
from .gworksheet import GWorksheet from .gworksheet import GWorksheet
from .misc import * from .misc import *
from .webdriver import Webdriver from .webdriver import Webdriver
from .gsheet import Gsheets from .gsheet import Gsheets
from .url import UrlUtil

View File

@@ -40,11 +40,11 @@ class GWorksheet:
def _col_index(self, col: str): def _col_index(self, col: str):
self._check_col_exists(col) self._check_col_exists(col)
return self.headers.index(self.columns[col]) return self.headers.index(self.columns[col].lower())
def col_exists(self, col: str): def col_exists(self, col: str):
self._check_col_exists(col) self._check_col_exists(col)
return self.columns[col] in self.headers return self.columns[col].lower() in self.headers
def count_rows(self): def count_rows(self):
return len(self.values) return len(self.values)

View File

@@ -0,0 +1,19 @@
import re
class UrlUtil:
    """Helpers to classify URLs before running steps that need public access."""

    # private telegram channel posts: https://t.me/c/<channel>/<post id>
    telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
    # instagram links, with or without "www." and over http or https
    is_instagram = re.compile(r"https?:\/\/(www\.)?instagram\.com")
    # backward-compatible alias for the original, misspelled attribute name
    is_istagram = is_instagram

    @staticmethod
    def clean(url):
        """Return the url unchanged (placeholder for future normalization)."""
        return url

    @staticmethod
    def is_auth_wall(url):
        """
        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
        """
        # a missing/non-string url cannot be matched and is not behind a known wall
        if not isinstance(url, str): return False
        if UrlUtil.telegram_private.match(url): return True
        if UrlUtil.is_instagram.match(url): return True
        return False

View File

@@ -1,9 +1,9 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "3" _MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "0" _PATCH = "6"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""