Compare commits

..

15 Commits

Author SHA1 Message Date
msramalho
39818e648a Bump version to v0.4.5 for release 2023-03-16 15:05:42 +00:00
Miguel Sozinho Ramalho
2bbf534d67 Merge pull request #72 from milesmcc/patch-1
Fix hash enricher for flatfile output (closes #71)
2023-03-16 15:04:55 +00:00
R. Miles McCain
6be7536fad Fix hash enricher for flatfile output (closes #71) 2023-03-14 13:37:54 -07:00
msramalho
0654e8c5c6 hash calculation in chunks to avoid exhausting RAM 2023-03-10 11:34:29 +00:00
msramalho
0e3c427371 Bump version to v0.4.3 for release 2023-02-27 10:30:06 +01:00
msramalho
7497bc08c0 Bump version to v0.4.2 for release 2023-02-23 17:14:29 +01:00
msramalho
49863768fe vk updates 2023-02-22 18:35:15 +01:00
msramalho
7b9483bbf9 yt-dlp update 2023-02-22 18:28:20 +01:00
msramalho
cd81cae559 auth wall for WACZ 2023-02-20 16:08:45 +00:00
msramalho
23894fad51 normalize columns 2023-02-20 16:08:35 +00:00
msramalho
876988b587 detect invalid url messages instagram bot 2023-02-20 12:22:52 +00:00
msramalho
f95293b84b support for multiple media instagram 2023-02-20 11:25:02 +00:00
msramalho
2fbcbe4e8b double session issues 2023-02-20 11:11:39 +00:00
msramalho
d1e4574c6c readme updates 2023-02-17 16:30:50 +00:00
msramalho
d347b26d37 updating example config 2023-02-17 16:26:23 +00:00
11 changed files with 292 additions and 141 deletions

View File

@@ -14,7 +14,6 @@ loguru = "*"
ffmpeg-python = "*" ffmpeg-python = "*"
selenium = "*" selenium = "*"
snscrape = "*" snscrape = "*"
yt-dlp = "*"
telethon = "*" telethon = "*"
google-api-python-client = "*" google-api-python-client = "*"
google-auth-httplib2 = "*" google-auth-httplib2 = "*"
@@ -23,13 +22,14 @@ oauth2client = "*"
python-slugify = "*" python-slugify = "*"
pyyaml = "*" pyyaml = "*"
dateparser = "*" dateparser = "*"
vk-url-scraper = "*"
python-twitter-v2 = "*" python-twitter-v2 = "*"
instaloader = "*" instaloader = "*"
tqdm = "*" tqdm = "*"
jinja2 = "*" jinja2 = "*"
cryptography = "==38.0.4" cryptography = "==38.0.4"
dataclasses-json = "*" dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
vk-url-scraper = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.9"

248
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4" "sha256": "7176a6666639452dbf30939fa095ff23518aee6da7d9561de0f12ba0aceed527"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -57,19 +57,19 @@
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d", "sha256:17f0d782487275cac12676a61b3f1a4900954cc454c842b8551ca47a3dcd59b4",
"sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734" "sha256:bf808f7433629650128ab577a9d4a0f4daf072d9f2f3a907b9d567a6952d9154"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.26.66" "version": "==1.26.77"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd", "sha256:9d94a02f2584b52c65fb3cb309fb1b29d6d0c36d69062722b0275c1c382c44c9",
"sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5" "sha256:d8aa7bffe2422de282b2d02945b7b45d5fecf00f67b65eebb0b1fa3de1abc6d0"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==1.29.66" "version": "==1.29.77"
}, },
"brotli": { "brotli": {
"hashes": [ "hashes": [
@@ -176,11 +176,11 @@
}, },
"certifi": { "certifi": {
"hashes": [ "hashes": [
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2022.6.15" "version": "==2022.12.7"
}, },
"cffi": { "cffi": {
"hashes": [ "hashes": [
@@ -253,11 +253,97 @@
}, },
"charset-normalizer": { "charset-normalizer": {
"hashes": [ "hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" "sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
], ],
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.6'",
"version": "==2.0.12" "version": "==3.0.1"
}, },
"click": { "click": {
"hashes": [ "hashes": [
@@ -348,11 +434,11 @@
}, },
"flask": { "flask": {
"hashes": [ "hashes": [
"sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b", "sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d",
"sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526" "sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.2.2" "version": "==2.2.3"
}, },
"future": { "future": {
"hashes": [ "hashes": [
@@ -371,19 +457,19 @@
}, },
"google-api-python-client": { "google-api-python-client": {
"hashes": [ "hashes": [
"sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483", "sha256:577c0aeae1eb3c754eacb9122d369d67609fef759bc6a4fa16cafeab4f30019b",
"sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0" "sha256:b9b6dc5f139892310093ba75d0df4c78f48655078953c923957dab1ec86129e7"
], ],
"index": "pypi", "index": "pypi",
"version": "==2.77.0" "version": "==2.79.0"
}, },
"google-auth": { "google-auth": {
"hashes": [ "hashes": [
"sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc", "sha256:5fd170986bce6bfd7bb5c845c4b8362edb1e0cba901e062196e83f8bb5d5d32c",
"sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd" "sha256:75d76ea857df65938e1f71dcbcd7d0cd48e3f80b34b8870ba229c9292081f7ef"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.16.0" "version": "==2.16.1"
}, },
"google-auth-httplib2": { "google-auth-httplib2": {
"hashes": [ "hashes": [
@@ -435,18 +521,18 @@
}, },
"idna": { "idna": {
"hashes": [ "hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
], ],
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.5'",
"version": "==3.3" "version": "==3.4"
}, },
"instaloader": { "instaloader": {
"hashes": [ "hashes": [
"sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e" "sha256:16040c170fb5230c1981a47e1990261e3c0ecffe0417be95fa265632244e7c01"
], ],
"index": "pypi", "index": "pypi",
"version": "==4.9.5" "version": "==4.9.6"
}, },
"itsdangerous": { "itsdangerous": {
"hashes": [ "hashes": [
@@ -565,11 +651,11 @@
}, },
"markdown-it-py": { "markdown-it-py": {
"hashes": [ "hashes": [
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27", "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da" "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.1.0" "version": "==2.2.0"
}, },
"markupsafe": { "markupsafe": {
"hashes": [ "hashes": [
@@ -700,23 +786,22 @@
}, },
"protobuf": { "protobuf": {
"hashes": [ "hashes": [
"sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30", "sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb",
"sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b", "sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b",
"sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc", "sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e",
"sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791", "sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930",
"sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717", "sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71",
"sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec", "sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4",
"sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7", "sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491",
"sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab", "sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6",
"sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2", "sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1",
"sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5", "sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e",
"sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1", "sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571",
"sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462", "sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e",
"sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97", "sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"
"sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==4.21.12" "version": "==4.22.0"
}, },
"pyaes": { "pyaes": {
"hashes": [ "hashes": [
@@ -838,14 +923,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2" "version": "==2.8.2"
}, },
"python-dotenv": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
"markers": "python_version >= '3.5'",
"version": "==0.20.0"
},
"python-slugify": { "python-slugify": {
"hashes": [ "hashes": [
"sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c", "sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c",
@@ -1019,11 +1096,11 @@
}, },
"requests": { "requests": {
"hashes": [ "hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"
], ],
"markers": "python_version >= '3.7' and python_version < '4'", "markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.0" "version": "==2.28.2"
}, },
"requests-oauthlib": { "requests-oauthlib": {
"hashes": [ "hashes": [
@@ -1067,11 +1144,11 @@
}, },
"selenium": { "selenium": {
"hashes": [ "hashes": [
"sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683", "sha256:bd04eb41395605d9b2b65fe587f3fed21431da75512985c52772529e5e210c60",
"sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af" "sha256:c48372905bffcc3b24bd55ab4683a07ee5e1f30fe918c59558ea5ee44cedf6c3"
], ],
"index": "pypi", "index": "pypi",
"version": "==4.8.0" "version": "==4.8.2"
}, },
"six": { "six": {
"hashes": [ "hashes": [
@@ -1106,11 +1183,11 @@
}, },
"soupsieve": { "soupsieve": {
"hashes": [ "hashes": [
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" "sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.7'",
"version": "==2.3.2.post1" "version": "==2.4"
}, },
"telethon": { "telethon": {
"hashes": [ "hashes": [
@@ -1160,11 +1237,11 @@
}, },
"typing-extensions": { "typing-extensions": {
"hashes": [ "hashes": [
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==4.4.0" "version": "==4.5.0"
}, },
"typing-inspect": { "typing-inspect": {
"hashes": [ "hashes": [
@@ -1198,27 +1275,30 @@
"version": "==4.1.1" "version": "==4.1.1"
}, },
"urllib3": { "urllib3": {
"hashes": [ "extras": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "socks"
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "hashes": [
"version": "==1.26.9" "sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==1.26.14"
}, },
"vk-api": { "vk-api": {
"hashes": [ "hashes": [
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", "sha256:c71021506449afe5b9bbb1c4acb0d86b35a007ddc21678478e46fbbeabd1f3ef",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" "sha256:c7741e40bc05980c91ed94c84542e1e7e7370e101b5eaa74222958d4130fe3c2"
], ],
"version": "==11.9.8" "version": "==11.9.9"
}, },
"vk-url-scraper": { "vk-url-scraper": {
"hashes": [ "hashes": [
"sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b", "sha256:5a32fb5419f7bb8bd35de8548948fe27a06f857a4d086c87e142bf07aabc3fd7",
"sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed" "sha256:a87c5aa7c1570c3aa87031e78c2052105e3681f57503fd4cb56470c3ab6106d6"
], ],
"index": "pypi", "index": "pypi",
"version": "==0.3.10" "version": "==0.3.15"
}, },
"websockets": { "websockets": {
"hashes": [ "hashes": [
@@ -1297,11 +1377,11 @@
}, },
"werkzeug": { "werkzeug": {
"hashes": [ "hashes": [
"sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f", "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe",
"sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5" "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.2.2" "version": "==2.2.3"
}, },
"wsproto": { "wsproto": {
"hashes": [ "hashes": [
@@ -1313,11 +1393,11 @@
}, },
"yt-dlp": { "yt-dlp": {
"hashes": [ "hashes": [
"sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02", "sha256:3b2df037c80922f0f83f63ee2f9253496b4a8668c0fe8d2a836ba9040f853b07",
"sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae" "sha256:9af92de5effc193bdb51216d9ebf28874d96180d202fae752b0d9f2a63380f3a"
], ],
"index": "pypi", "index": "pypi",
"version": "==2022.7.18" "version": "==2023.2.17"
} }
}, },
"develop": { "develop": {

View File

@@ -153,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
# all the configurations come from ./orchestration.yaml # all the configurations come from ./orchestration.yaml
auto-archiver auto-archiver
# all the configurations come from ./secrets/orchestration.yaml # all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config orchestration.yaml auto-archiver --config secrets/orchestration.yaml
# uses the configurations but for another google docs sheet # uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names # with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided # notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}' auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private # all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --s3_storage.private=1 auto-archiver --s3_storage.private=1
``` ```
@@ -166,11 +166,11 @@ auto-archiver --s3_storage.private=1
#### Google Drive #### Google Drive
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
#### Telethon (Telegrams API Library) #### Telethon + Instagram with telegram bot
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root. The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root.
## Running on Google Sheets Feeder (gsheets_feeder) ## Running on Google Sheets Feeder (gsheet_feeder)
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`. This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names. This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
@@ -183,7 +183,7 @@ When the auto archiver starts running, it updates the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png) ![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following: The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png) ![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
--- ---

View File

@@ -1,22 +1,21 @@
steps: steps:
# only 1 feeder allowed # only 1 feeder allowed
# feeder: cli_feeder # default feeder feeder: gsheet_feeder # defaults to cli_feeder
feeder: gsheet_feeder # default -> only expects URL from CLI archivers: # order matters, uncomment to activate
archivers: # order matters
# - vk_archiver # - vk_archiver
# - telethon_archiver # - telethon_archiver
# - telegram_archiver # - telegram_archiver
# - twitter_archiver # - twitter_archiver
# - twitter_api_archiver # - twitter_api_archiver
# - instagram_archiver
# - instagram_tbot_archiver # - instagram_tbot_archiver
# - instagram_archiver
# - tiktok_archiver # - tiktok_archiver
- youtubedl_archiver - youtubedl_archiver
# - wayback_archiver_enricher - wayback_archiver_enricher
enrichers: enrichers:
- hash_enricher - hash_enricher
- screenshot_enricher # - screenshot_enricher
- thumbnail_enricher # - thumbnail_enricher
# - wayback_archiver_enricher # - wayback_archiver_enricher
# - wacz_enricher # - wacz_enricher
@@ -26,16 +25,18 @@ steps:
# - s3_storage # - s3_storage
# - gdrive_storage # - gdrive_storage
databases: databases:
# - console_db - console_db
# - csv_db # - csv_db
- gsheet_db # - gsheet_db
# - mongo_db # - mongo_db
configurations: configurations:
gsheet_feeder: gsheet_feeder:
sheet: auto-archiver-test sheet: "your sheet name"
header: 2 # defaults to 1 in GSheetsFeeder header: 1
service_account: "secrets/service_account.json" service_account: "secrets/service_account.json"
# allow_worksheets: "only parse this worksheet"
# block_worksheets: "blocked sheet 1,blocked sheet 2"
use_sheet_names_in_stored_paths: false use_sheet_names_in_stored_paths: false
columns: columns:
url: link url: link
@@ -53,27 +54,70 @@ configurations:
hash: hash hash: hash
wacz: wacz wacz: wacz
replaywebpage: replaywebpage replaywebpage: replaywebpage
instagram_tbot_archiver:
api_id: "TELEGRAM_BOT_API_ID"
api_hash: "TELEGRAM_BOT_API_HASH"
# session_file: "secrets/anon"
telethon_archiver:
api_id: "TELEGRAM_BOT_API_ID"
api_hash: "TELEGRAM_BOT_API_HASH"
# session_file: "secrets/anon"
join_channels: false
channel_invites: # if you want to archive from private channels
- invite: https://t.me/+123456789
id: 0000000001
- invite: https://t.me/+123456788
id: 0000000002
twitter_api_archiver:
# either bearer_token only
bearer_token: "TWITTER_BEARER_TOKEN"
# OR all of the below
# consumer_key: ""
# consumer_secret: ""
# access_token: ""
# access_secret: ""
instagram_archiver:
username: "INSTAGRAM_USERNAME"
password: "INSTAGRAM_PASSWORD"
# session_file: "secrets/instaloader.session"
vk_archiver:
username: "or phone number"
password: "vk pass"
session_file: "secrets/vk_config.v2.json"
screenshot_enricher: screenshot_enricher:
width: 1280 width: 1280
height: 2300 height: 2300
wayback_archiver_enricher: wayback_archiver_enricher:
timeout: 10 timeout: 10
key: "" key: "wayback key"
secret: "" secret: "wayback secret"
hash_enricher: hash_enricher:
algorithm: "SHA3-512" algorithm: "SHA3-512" # can also be SHA-256
# wacz: wacz_enricher:
# profile: secrets/profile.tar.gz profile: secrets/profile.tar.gz
local_storage: local_storage:
save_to: "./local_archive" save_to: "./local_archive"
save_absolute: true save_absolute: true
filename_generator: static filename_generator: static
path_generator: flat path_generator: flat
s3_storage:
bucket: your-bucket-name
region: reg1
key: S3_KEY
secret: S3_SECRET
endpoint_url: "https://{region}.digitaloceanspaces.com"
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
private: false
# with 'random' you can generate a random UUID for the URL instead of a predictable path, which is useful to keep files public but unlisted; the alternative is 'default', or omitting key_path from the config
key_path: random
gdrive_storage: gdrive_storage:
path_generator: url path_generator: url
filename_generator: random filename_generator: random
root_folder_id: TODO root_folder_id: folder_id_from_url
oauth_token: secrets/gd-token.json oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
service_account: "secrets/service_account.json" service_account: "secrets/service_account.json"

View File

@@ -2,14 +2,14 @@
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from loguru import logger from loguru import logger
import time, os import time, os
from sqlite3 import OperationalError
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media
class InstagramTbotArchiver(Archiver): class InstagramTbotArchiver(Archiver):
""" """
calls a telegram bot to fetch instagram posts/stories... calls a telegram bot to fetch instagram posts/stories... and gets available media from it
https://github.com/adw0rd/instagrapi https://github.com/adw0rd/instagrapi
https://t.me/instagram_load_bot https://t.me/instagram_load_bot
""" """
@@ -20,14 +20,17 @@ class InstagramTbotArchiver(Archiver):
self.assert_valid_string("api_id") self.assert_valid_string("api_id")
self.assert_valid_string("api_hash") self.assert_valid_string("api_hash")
self.timeout = int(self.timeout) self.timeout = int(self.timeout)
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
except OperationalError as e:
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."}, "timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
} }
@@ -47,20 +50,27 @@ class InstagramTbotArchiver(Archiver):
since_id = self.client.send_message(entity=chat, message=url).id since_id = self.client.send_message(entity=chat, message=url).id
attempts = 0 attempts = 0
media = None seen_media = []
message = "" message = ""
time.sleep(4) time.sleep(4)
while attempts < self.timeout and (not message or not media): # media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < self.timeout and (not message or not len(seen_media)):
attempts += 1 attempts += 1
time.sleep(1) time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id): for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id) since_id = max(since_id, post.id)
if post.media and not media: if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest) media = self.client.download_media(post.media, filename_dest)
if media: result.add_media(Media(media)) if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message if post.message: message += post.message
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
if message: if message:
result.set_content(message).set_title(message[:128]) result.set_content(message).set_title(message[:128])

View File

@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
super().__init__(config) super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"] algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]} "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
with open(m.filename, "rb") as f: if len(hd := self.calculate_hash(m.filename)):
bytes = f.read() # read entire file as bytes to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
hash = None
if self.algorithm == "SHA-256": def calculate_hash(self, filename):
hash = hashlib.sha256(bytes) hash = None
elif self.algorithm == "SHA3-512": if self.algorithm == "SHA-256":
hash = hashlib.sha3_512(bytes) hash = hashlib.sha256()
else: continue elif self.algorithm == "SHA3-512":
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}") hash = hashlib.sha3_512()
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()

View File

@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
return { return {
"width": {"default": 1280, "help": "width of the screenshots"}, "width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"}, "height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"} "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
@@ -27,7 +28,7 @@ class ScreenshotEnricher(Enricher):
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
try: try:
driver.get(url) driver.get(url)
time.sleep(2) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
@@ -35,4 +36,3 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
except Exception as e: except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
# return None

View File

@@ -3,6 +3,7 @@ from loguru import logger
from ..core import Media, Metadata from ..core import Media, Metadata
from . import Enricher from . import Enricher
from ..utils import UrlUtil
class WaczEnricher(Enricher): class WaczEnricher(Enricher):
@@ -20,11 +21,17 @@ class WaczEnricher(Enricher):
return { return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
} }
def enrich(self, to_enrich: Metadata) -> bool: def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker # TODO: figure out support for browsertrix in docker
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"generating WACZ for {url=}") logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8] collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir()) browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())

View File

@@ -5,6 +5,7 @@ import hashlib
from typing import IO, Any from typing import IO, Any
from ..core import Media, Metadata, Step from ..core import Media, Metadata, Step
from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
import os, uuid import os, uuid
from slugify import slugify from slugify import slugify
@@ -64,18 +65,18 @@ class Storage(Step):
filename, ext = os.path.splitext(media.filename) filename, ext = os.path.splitext(media.filename)
# path_generator logic # path_generator logic
if self.path_generator == "flat": if self.path_generator == "flat":
path = "" path = ""
filename = slugify(filename) # in case it comes with os.sep filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url()) elif self.path_generator == "url": path = slugify(item.get_url())
elif self.path_generator == "random": elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True) path = item.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic # filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static": elif self.filename_generator == "static":
with open(media.filename, "rb") as f: he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
bytes = f.read() # read entire file as bytes hd = he.calculate_hash(media.filename)
filename = hashlib.sha256(bytes).hexdigest()[:24] filename = hd[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}") media.key = os.path.join(folder, path, f"{filename}{ext}")

View File

@@ -40,11 +40,11 @@ class GWorksheet:
def _col_index(self, col: str): def _col_index(self, col: str):
self._check_col_exists(col) self._check_col_exists(col)
return self.headers.index(self.columns[col]) return self.headers.index(self.columns[col].lower())
def col_exists(self, col: str): def col_exists(self, col: str):
self._check_col_exists(col) self._check_col_exists(col)
return self.columns[col] in self.headers return self.columns[col].lower() in self.headers
def count_rows(self): def count_rows(self):
return len(self.values) return len(self.values)

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "4" _MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "1" _PATCH = "5"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""