Compare commits

...

22 Commits

Author SHA1 Message Date
msramalho
0e3c427371 Bump version to v0.4.3 for release 2023-02-27 10:30:06 +01:00
msramalho
7497bc08c0 Bump version to v0.4.2 for release 2023-02-23 17:14:29 +01:00
msramalho
49863768fe vk updates 2023-02-22 18:35:15 +01:00
msramalho
7b9483bbf9 yt-dlp update 2023-02-22 18:28:20 +01:00
msramalho
cd81cae559 auth wall for WACZ 2023-02-20 16:08:45 +00:00
msramalho
23894fad51 normalize columns 2023-02-20 16:08:35 +00:00
msramalho
876988b587 detect invalid url messages instagram bot 2023-02-20 12:22:52 +00:00
msramalho
f95293b84b support for multiple media instagram 2023-02-20 11:25:02 +00:00
msramalho
2fbcbe4e8b double session issues 2023-02-20 11:11:39 +00:00
msramalho
d1e4574c6c readme updates 2023-02-17 16:30:50 +00:00
msramalho
d347b26d37 updating example config 2023-02-17 16:26:23 +00:00
msramalho
1970fa3c82 new instagram archiver via telegram bot 2023-02-17 16:15:25 +00:00
msramalho
aa5430451e instagram archiver via telegram bot 2023-02-17 15:46:29 +00:00
msramalho
f35875a94c name fix 2023-02-17 15:46:05 +00:00
msramalho
5505255ea3 url auth wall detect 2023-02-17 15:45:58 +00:00
msramalho
da17b3f68a name fix 2023-02-17 15:45:35 +00:00
msramalho
d6dbdec6ac example 2023-02-09 12:32:55 +00:00
msramalho
224ebe7ee8 links 2023-02-08 22:27:56 +00:00
msramalho
54a1bc2172 update readme 2023-02-08 22:26:24 +00:00
msramalho
77948207d1 update 2023-02-08 22:24:40 +00:00
msramalho
60552ae0ea update readme 2023-02-08 22:23:25 +00:00
msramalho
f255271ecb update README 2023-02-08 22:17:22 +00:00
20 changed files with 455 additions and 193 deletions

View File

@@ -14,7 +14,6 @@ loguru = "*"
ffmpeg-python = "*"
selenium = "*"
snscrape = "*"
yt-dlp = "*"
telethon = "*"
google-api-python-client = "*"
google-auth-httplib2 = "*"
@@ -23,13 +22,14 @@ oauth2client = "*"
python-slugify = "*"
pyyaml = "*"
dateparser = "*"
vk-url-scraper = "*"
python-twitter-v2 = "*"
instaloader = "*"
tqdm = "*"
jinja2 = "*"
cryptography = "==38.0.4"
dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
vk-url-scraper = "*"
[requires]
python_version = "3.9"

248
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4"
"sha256": "7176a6666639452dbf30939fa095ff23518aee6da7d9561de0f12ba0aceed527"
},
"pipfile-spec": 6,
"requires": {
@@ -57,19 +57,19 @@
},
"boto3": {
"hashes": [
"sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d",
"sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734"
"sha256:17f0d782487275cac12676a61b3f1a4900954cc454c842b8551ca47a3dcd59b4",
"sha256:bf808f7433629650128ab577a9d4a0f4daf072d9f2f3a907b9d567a6952d9154"
],
"index": "pypi",
"version": "==1.26.66"
"version": "==1.26.77"
},
"botocore": {
"hashes": [
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd",
"sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5"
"sha256:9d94a02f2584b52c65fb3cb309fb1b29d6d0c36d69062722b0275c1c382c44c9",
"sha256:d8aa7bffe2422de282b2d02945b7b45d5fecf00f67b65eebb0b1fa3de1abc6d0"
],
"markers": "python_version >= '3.7'",
"version": "==1.29.66"
"version": "==1.29.77"
},
"brotli": {
"hashes": [
@@ -176,11 +176,11 @@
},
"certifi": {
"hashes": [
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
"sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
"sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
"markers": "python_version >= '3.6'",
"version": "==2022.6.15"
"version": "==2022.12.7"
},
"cffi": {
"hashes": [
@@ -253,11 +253,97 @@
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
"sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
"sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
],
"markers": "python_version >= '3.5'",
"version": "==2.0.12"
"markers": "python_version >= '3.6'",
"version": "==3.0.1"
},
"click": {
"hashes": [
@@ -348,11 +434,11 @@
},
"flask": {
"hashes": [
"sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b",
"sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526"
"sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d",
"sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"
],
"markers": "python_version >= '3.7'",
"version": "==2.2.2"
"version": "==2.2.3"
},
"future": {
"hashes": [
@@ -371,19 +457,19 @@
},
"google-api-python-client": {
"hashes": [
"sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483",
"sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0"
"sha256:577c0aeae1eb3c754eacb9122d369d67609fef759bc6a4fa16cafeab4f30019b",
"sha256:b9b6dc5f139892310093ba75d0df4c78f48655078953c923957dab1ec86129e7"
],
"index": "pypi",
"version": "==2.77.0"
"version": "==2.79.0"
},
"google-auth": {
"hashes": [
"sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc",
"sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd"
"sha256:5fd170986bce6bfd7bb5c845c4b8362edb1e0cba901e062196e83f8bb5d5d32c",
"sha256:75d76ea857df65938e1f71dcbcd7d0cd48e3f80b34b8870ba229c9292081f7ef"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.16.0"
"version": "==2.16.1"
},
"google-auth-httplib2": {
"hashes": [
@@ -435,18 +521,18 @@
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
"sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
],
"markers": "python_version >= '3.5'",
"version": "==3.3"
"version": "==3.4"
},
"instaloader": {
"hashes": [
"sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e"
"sha256:16040c170fb5230c1981a47e1990261e3c0ecffe0417be95fa265632244e7c01"
],
"index": "pypi",
"version": "==4.9.5"
"version": "==4.9.6"
},
"itsdangerous": {
"hashes": [
@@ -565,11 +651,11 @@
},
"markdown-it-py": {
"hashes": [
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
"sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
"sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
"version": "==2.2.0"
},
"markupsafe": {
"hashes": [
@@ -700,23 +786,22 @@
},
"protobuf": {
"hashes": [
"sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30",
"sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b",
"sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc",
"sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791",
"sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717",
"sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec",
"sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7",
"sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab",
"sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2",
"sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5",
"sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1",
"sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462",
"sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97",
"sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
"sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb",
"sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b",
"sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e",
"sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930",
"sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71",
"sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4",
"sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491",
"sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6",
"sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1",
"sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e",
"sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571",
"sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e",
"sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"
],
"markers": "python_version >= '3.7'",
"version": "==4.21.12"
"version": "==4.22.0"
},
"pyaes": {
"hashes": [
@@ -838,14 +923,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"python-dotenv": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
"markers": "python_version >= '3.5'",
"version": "==0.20.0"
},
"python-slugify": {
"hashes": [
"sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c",
@@ -1019,11 +1096,11 @@
},
"requests": {
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
"sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa",
"sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"
],
"markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.0"
"version": "==2.28.2"
},
"requests-oauthlib": {
"hashes": [
@@ -1067,11 +1144,11 @@
},
"selenium": {
"hashes": [
"sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683",
"sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af"
"sha256:bd04eb41395605d9b2b65fe587f3fed21431da75512985c52772529e5e210c60",
"sha256:c48372905bffcc3b24bd55ab4683a07ee5e1f30fe918c59558ea5ee44cedf6c3"
],
"index": "pypi",
"version": "==4.8.0"
"version": "==4.8.2"
},
"six": {
"hashes": [
@@ -1106,11 +1183,11 @@
},
"soupsieve": {
"hashes": [
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
"sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955",
"sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"
],
"markers": "python_version >= '3.6'",
"version": "==2.3.2.post1"
"markers": "python_version >= '3.7'",
"version": "==2.4"
},
"telethon": {
"hashes": [
@@ -1160,11 +1237,11 @@
},
"typing-extensions": {
"hashes": [
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
"sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
"sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
],
"markers": "python_version >= '3.7'",
"version": "==4.4.0"
"version": "==4.5.0"
},
"typing-inspect": {
"hashes": [
@@ -1198,27 +1275,30 @@
"version": "==4.1.1"
},
"urllib3": {
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
"extras": [
"socks"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
"hashes": [
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==1.26.14"
},
"vk-api": {
"hashes": [
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
"sha256:c71021506449afe5b9bbb1c4acb0d86b35a007ddc21678478e46fbbeabd1f3ef",
"sha256:c7741e40bc05980c91ed94c84542e1e7e7370e101b5eaa74222958d4130fe3c2"
],
"version": "==11.9.8"
"version": "==11.9.9"
},
"vk-url-scraper": {
"hashes": [
"sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b",
"sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed"
"sha256:5a32fb5419f7bb8bd35de8548948fe27a06f857a4d086c87e142bf07aabc3fd7",
"sha256:a87c5aa7c1570c3aa87031e78c2052105e3681f57503fd4cb56470c3ab6106d6"
],
"index": "pypi",
"version": "==0.3.10"
"version": "==0.3.15"
},
"websockets": {
"hashes": [
@@ -1297,11 +1377,11 @@
},
"werkzeug": {
"hashes": [
"sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f",
"sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"
"sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe",
"sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"
],
"markers": "python_version >= '3.7'",
"version": "==2.2.2"
"version": "==2.2.3"
},
"wsproto": {
"hashes": [
@@ -1313,11 +1393,11 @@
},
"yt-dlp": {
"hashes": [
"sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02",
"sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae"
"sha256:3b2df037c80922f0f83f63ee2f9253496b4a8668c0fe8d2a836ba9040f853b07",
"sha256:9af92de5effc193bdb51216d9ebf28874d96180d202fae752b0d9f2a63380f3a"
],
"index": "pypi",
"version": "==2022.7.18"
"version": "==2023.2.17"
}
},
"develop": {

View File

@@ -1,4 +1,12 @@
# Auto Archiver
<h1 align="center">Auto Archiver</h1>
[![PyPI version](https://badge.fury.io/py/auto-archiver.svg)](https://badge.fury.io/py/auto-archiver)
[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/bellingcat/auto-archiver?label=version&logo=docker)](https://pypi.org/project/auto-archiver/)
<!-- ![Docker Pulls](https://img.shields.io/docker/pulls/bellingcat/auto-archiver) -->
<!-- [![PyPI download month](https://img.shields.io/pypi/dm/auto-archiver.svg)](https://pypi.python.org/pypi/auto-archiver/) -->
<!-- [![Documentation Status](https://readthedocs.org/projects/vk-url-scraper/badge/?version=latest)](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).
@@ -15,6 +23,11 @@ But **you always need a configuration/orchestration file**, which is where you'l
## How to run the auto-archiver
### Option 1 - docker
<details><summary><code>Docker instructions</code></summary>
[![dockeri.co](https://dockerico.blankenship.io/image/bellingcat/auto-archiver)](https://hub.docker.com/r/bellingcat/auto-archiver)
Docker works like a virtual machine running inside your computer: it isolates everything and makes installation simple. Since it is an isolated environment, when you need to pass it your orchestration file or get downloaded media out of docker, you will need to connect folders on your machine with folders inside docker using the `-v` volume flag.
@@ -32,14 +45,20 @@ Docker works like a virtual machine running inside your computer, it isolates ev
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
</details>
### Option 2 - python package
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.8 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml`
1. this assumes your orchestration file is inside a `secrets/` folder, which we advise
</details>
### Option 3 - local installation
This can also be used for development.
@@ -60,13 +79,6 @@ Clone and run:
</details><br/>
### Examples
# Orchestration
The archiver work is orchestrated by the following workflow (we call each a **step**):
1. **Feeder** gets the links (from a spreadsheet, from the console, ...)
@@ -85,7 +97,7 @@ The structure of orchestration file is split into 2 parts: `steps` (what **steps
steps:
feeder: gsheet_feeder
archivers: # order matters
- youtubedl_enricher
- youtubedl_archiver
enrichers:
- thumbnail_enricher
formatter: html_formatter
@@ -141,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
# all the configurations come from ./orchestration.yaml
auto-archiver
# all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config orchestration.yaml
# uses the configurations but for another google docs sheet
auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}'
auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --s3_storage.private=1
```
@@ -154,11 +166,11 @@ auto-archiver --s3_storage.private=1
#### Google Drive
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
#### Telethon (Telegrams API Library)
#### Telethon + Instagram with telegram bot
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` file in the root.
## Running on Google Sheets Feeder (gsheets_feeder)
## Running on Google Sheets Feeder (gsheet_feeder)
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
@@ -171,23 +183,25 @@ When the auto archiver starts running, it updates the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
---
## Development
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.
# Docker development
* working with docker locally:
#### Docker development
working with docker locally:
* `docker build . -t auto-archiver` to build a local image
* `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
* release to docker hub
release to docker hub
* `docker image tag auto-archiver bellingcat/auto-archiver:latest`
* `docker push bellingcat/auto-archiver`
# RELEASE
#### RELEASE
* update version in [version.py](src/auto_archiver/version.py)
* run `bash ./scripts/release.sh` and confirm
* package is automatically updated in pypi

View File

@@ -1,80 +1,123 @@
steps:
# only 1 feeder allowed
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addition by modifying for each to while not feeder.done() if it becomes necessary
feeder: gsheet_feeder # default -> only expects URL from CLI
archivers: # order matters
- telethon
# - tiktok
# - twitter
# - instagram
# - webarchive # this way it runs as a failsafe only
# enrichers:
# - screenshot
# - wacz
# - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
# - thumbnails
formatters:
- HTMLFormater
- PdfFormater
feeder: gsheet_feeder # defaults to cli_feeder
archivers: # order matters, uncomment to activate
# - vk_archiver
# - telethon_archiver
# - telegram_archiver
# - twitter_archiver
# - twitter_api_archiver
# - instagram_tbot_archiver
# - instagram_archiver
# - tiktok_archiver
- youtubedl_archiver
- wayback_archiver_enricher
enrichers:
- hash_enricher
# - screenshot_enricher
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_enricher
formatter: html_formatter # defaults to mute_formatter
storages:
- local_storage
- s3
# - s3_storage
# - gdrive_storage
databases:
- gsheets_db
- mongo_db
- console_db
# - csv_db
# - gsheet_db
# - mongo_db
configurations:
gsheet_feeder:
sheet: my-auto-archiver
header: 2 # defaults to 1 in GSheetsFeeder
sheet: "your sheet name"
header: 1
service_account: "secrets/service_account.json"
# allow_worksheets: "allowed"
# block_worksheets: "blocked1,blocked2"
# allow_worksheets: "only parse this worksheet"
# block_worksheets: "blocked sheet 1,blocked sheet 2"
use_sheet_names_in_stored_paths: false
columns:
'url': 'link'
'status': 'archive status'
'folder': 'destination folder'
'archive': 'archive location'
'date': 'archive date'
'thumbnail': 'thumbnail'
'thumbnail_index': 'thumbnail index'
'timestamp': 'upload timestamp'
'title': 'upload title'
'duration': 'duration'
'screenshot': 'screenshot'
'hash': 'hash'
'wacz': 'wacz'
'replaywebpage': 'replaywebpage'
telethon:
api_id: "1234567"
api_hash: "examplehash"
session_file: "secrets/anon"
channel_invites:
- invite: https://t.me/+XXXXXXXXXXXXXX
id: 1000000000
- invite: https://t.me/joinchat/XXXXXXXXXXXXXX
id: 1000000001
url: link
status: archive status
folder: destination folder
archive: archive location
date: archive date
thumbnail: thumbnail
thumbnail_index: thumbnail index
timestamp: upload timestamp
title: upload title
text: textual content
duration: duration
screenshot: screenshot
hash: hash
wacz: wacz
replaywebpage: replaywebpage
instagram_tbot_archiver:
api_id: "TELEGRAM_BOT_API_ID"
api_hash: "TELEGRAM_BOT_API_HASH"
# session_file: "secrets/anon"
telethon_archiver:
api_id: "TELEGRAM_BOT_API_ID"
api_hash: "TELEGRAM_BOT_API_HASH"
# session_file: "secrets/anon"
join_channels: false
channel_invites: # if you want to archive from private channels
- invite: https://t.me/+123456789
id: 0000000001
- invite: https://t.me/+123456788
id: 0000000002
tiktok:
api_keys:
- username: 1
password: 2
- username: 3
password: 4
username: "abc"
password: "123"
token: "here"
screenshot:
twitter_api_archiver:
# either bearer_token only
bearer_token: "TWITTER_BEARER_TOKEN"
# OR all of the below
# consumer_key: ""
# consumer_secret: ""
# access_token: ""
# access_secret: ""
instagram_archiver:
username: "INSTAGRAM_USERNAME"
password: "INSTAGRAM_PASSWORD"
# session_file: "secrets/instaloader.session"
vk_archiver:
username: "or phone number"
password: "vk pass"
session_file: "secrets/vk_config.v2.json"
screenshot_enricher:
width: 1280
height: 4600
wacz:
height: 2300
wayback_archiver_enricher:
timeout: 10
key: "wayback key"
secret: "wayback secret"
hash_enricher:
algorithm: "SHA3-512" # can also be SHA-256
wacz_enricher:
profile: secrets/profile.tar.gz
webarchive:
api_key: "12345"
s3:
- bucket: 123
- region: "nyc3"
- cdn: "{region}{bucket}"
local_storage:
save_to: "./local_archive"
save_absolute: true
filename_generator: static
path_generator: flat
s3_storage:
bucket: your-bucket-name
region: reg1
key: S3_KEY
secret: S3_SECRET
endpoint_url: "https://{region}.digitaloceanspaces.com"
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
private: false
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
key_path: random
gdrive_storage:
path_generator: url
filename_generator: random
root_folder_id: folder_id_from_url
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
service_account: "secrets/service_account.json"

View File

@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver

View File

@@ -0,0 +1,77 @@
from telethon.sync import TelegramClient
from loguru import logger
import time, os
from sqlite3 import OperationalError
from . import Archiver
from ..core import Metadata, Media
class InstagramTbotArchiver(Archiver):
    """
    Calls a telegram bot to fetch instagram posts/stories... and gets the
    available media (and accompanying text) from the bot's replies.

    https://github.com/adw0rd/instagrapi
    https://t.me/instagram_load_bot
    """
    name = "instagram_tbot_archiver"

    def __init__(self, config: dict) -> None:
        """
        Validate the telegram credentials and create the telegram client.

        `api_id` and `api_hash` must be valid strings; `timeout` is coerced
        to an int number of seconds.
        """
        super().__init__(config)
        self.assert_valid_string("api_id")
        self.assert_valid_string("api_hash")
        self.timeout = int(self.timeout)
        try:
            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
        except OperationalError as e:
            # NOTE(review): the error is only logged, so `self.client` stays
            # unset and setup()/download() will fail later with AttributeError.
            logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this archiver with their defaults and help texts."""
        return {
            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
            "timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
        }

    def setup(self) -> None:
        """Start the telegram client once to check that the login works."""
        logger.info(f"SETUP {self.name} checking login...")
        with self.client.start():
            logger.success(f"SETUP {self.name} login works.")

    def download(self, item: Metadata) -> Metadata:
        """
        Send the item's URL to the telegram bot and collect its replies.

        Every media attachment the bot replies with is downloaded into the
        item's temporary directory and added to the result; the reply text
        becomes the result's content and (truncated to 128 chars) its title.

        Returns the populated Metadata marked as success, or False when the
        URL is not an instagram URL, when the bot reports an invalid link, or
        when the bot delivered neither media nor text before the timeout.
        """
        url = item.get_url()
        if "instagram.com" not in url: return False

        invalid_link_reply = "You must enter a URL to a post"
        result = Metadata()
        tmp_dir = item.get_tmp_dir()
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            since_id = self.client.send_message(entity=chat, message=url).id

            attempts = 0
            seen_media = set()  # ids of the posts whose media was downloaded
            message = ""
            invalid_link = False
            time.sleep(4)
            # media is added before text by the bot so it can be used as a stop-logic mechanism
            while attempts < self.timeout and not invalid_link and not (message and seen_media):
                attempts += 1
                time.sleep(1)
                for post in self.client.iter_messages(chat, min_id=since_id):
                    # advance the cursor so the next poll only returns newer replies
                    since_id = max(since_id, post.id)
                    if post.media and post.id not in seen_media:
                        filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
                        media = self.client.download_media(post.media, filename_dest)
                        if media:
                            result.add_media(Media(media))
                            seen_media.add(post.id)
                    if post.message: message += post.message
                # the error reply never comes with media: stop polling right
                # away instead of waiting for the whole timeout
                invalid_link = invalid_link_reply in message

        if invalid_link:
            logger.debug(f"invalid link {url=} for {self.name}: {message}")
            return False

        if not message and not seen_media:
            # nothing came back from the bot: do not report an empty success
            logger.debug(f"no content received for {url=} by {self.name}")
            return False

        if message:
            result.set_content(message).set_title(message[:128])

        return result.success("insta-via-bot")

View File

@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
with self.client.start():
# with self.client.start(bot_token=self.bot_token):
try:
post = self.client.get_messages(chat, ids=post_id)
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False

View File

@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
return self.link_clean_pattern.sub("\\1", url)
def is_rearchivable(self, url: str) -> bool:
# Twitter posts are static
# Twitter posts are static (for now)
return False
def download(self, item: Metadata) -> Metadata:
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter")
return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
"""

View File

@@ -6,7 +6,7 @@ from ..core import Metadata, Media
class YoutubeDLArchiver(Archiver):
name = "youtubedl_enricher"
name = "youtubedl_archiver"
def __init__(self, config: dict) -> None:
super().__init__(config)

View File

@@ -63,6 +63,9 @@ class Metadata:
def is_success(self) -> bool:
    """Tell whether the text "success" occurs in this item's status."""
    succeeded = "success" in self.status
    return succeeded
def is_empty(self) -> bool:
    """
    Tell whether nothing was archived for this item: it is not a success,
    it holds no media and its clean metadata has at most two entries.
    """
    if self.is_success():
        return False
    if len(self.media) != 0:
        return False
    # up to two entries are tolerated: url, processed_at
    return len(self.get_clean_metadata()) <= 2
@property  # getter .netloc
def netloc(self) -> str:
    """Network location (host, plus port if present) of this item's URL."""
    parsed_url = urlparse(self.get_url())
    return parsed_url.netloc
@@ -122,7 +125,7 @@ class Metadata:
for m in self.media:
if m.get("id") == id: return m
return default
def get_first_image(self, default=None) -> Media:
for m in self.media:
if "image" in m.mimetype: return m

View File

@@ -123,6 +123,9 @@ class ArchivingOrchestrator:
s.store(final_media, result)
result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases (DBs, Google Sheets, CSV, ...)
for d in self.databases: d.done(result)

View File

@@ -2,10 +2,8 @@ from typing import Union, Tuple
import datetime
from urllib.parse import quote
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from . import Database
from ..core import Metadata
from ..core import Media
@@ -61,13 +59,13 @@ class GsheetsDb(Database):
cell_updates.append((row, 'status', item.status))
media: Media = item.get_final_media()
batch_if_valid('archive', "\n".join(media.urls))
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):

View File

@@ -3,7 +3,7 @@ import time, uuid, os
from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver
from ..utils import Webdriver, UrlUtil
from ..core import Media, Metadata
class ScreenshotEnricher(Enricher):
@@ -14,16 +14,21 @@ class ScreenshotEnricher(Enricher):
return {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
}
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
try:
driver.get(url)
time.sleep(2)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
@@ -31,4 +36,3 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
# return None

View File

@@ -3,6 +3,7 @@ from loguru import logger
from ..core import Media, Metadata
from . import Enricher
from ..utils import UrlUtil
class WaczEnricher(Enricher):
@@ -20,11 +21,17 @@ class WaczEnricher(Enricher):
return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
}
def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())

View File

@@ -1,8 +1,10 @@
from loguru import logger
import time, requests
from . import Enricher
from ..archivers import Archiver
from ..utils import UrlUtil
from ..core import Metadata
class WaybackArchiverEnricher(Enricher, Archiver):
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
def enrich(self, to_enrich: Metadata) -> bool:
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
return
logger.debug(f"calling wayback for {url=}")
if to_enrich.get("wayback"):

View File

@@ -3,6 +3,7 @@ from dataclasses import dataclass
import mimetypes, uuid, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
from loguru import logger
from ..version import __version__
from ..core import Metadata, Media
@@ -26,12 +27,17 @@ class HtmlFormatter(Formatter):
@staticmethod
def configs() -> dict:
return {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
}
def format(self, item: Metadata) -> Media:
url = item.get_url()
if item.is_empty():
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
return
content = self.template.render(
url=item.get_url(),
url=url,
title=item.get_title(),
media=item.media,
metadata=item.get_clean_metadata(),

View File

@@ -2,4 +2,5 @@
from .gworksheet import GWorksheet
from .misc import *
from .webdriver import Webdriver
from .gsheet import Gsheets
from .gsheet import Gsheets
from .url import UrlUtil

View File

@@ -40,11 +40,11 @@ class GWorksheet:
def _col_index(self, col: str):
self._check_col_exists(col)
return self.headers.index(self.columns[col])
return self.headers.index(self.columns[col].lower())
def col_exists(self, col: str):
self._check_col_exists(col)
return self.columns[col] in self.headers
return self.columns[col].lower() in self.headers
def count_rows(self):
return len(self.values)

View File

@@ -0,0 +1,19 @@
import re
class UrlUtil:
    """Helpers to inspect URLs before they are handed to archivers/enrichers."""

    # Links to posts in private telegram channels (t.me/c/<channel>/<post>).
    # Groups: 1 = "/c", 2 = channel, 3 = post id.
    # Both http and https are accepted and the host is matched case-insensitively.
    telegram_private = re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)", re.IGNORECASE)
    # Instagram links, with or without the "www." prefix, over http or https.
    # The attribute keeps its original (misspelled) name so that existing
    # references to it keep working.
    is_istagram = re.compile(r"https?:\/\/(?:www\.)?instagram\.com", re.IGNORECASE)

    @staticmethod
    def clean(url): return url

    @staticmethod
    def is_auth_wall(url):
        """
        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work

        Returns True for private telegram channel links and for instagram
        links, False otherwise. Patterns are matched from the start of the
        URL, so an instagram/telegram link embedded in another URL's query
        string does not count.
        """
        if UrlUtil.telegram_private.match(url): return True
        if UrlUtil.is_istagram.match(url): return True

        return False

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "3"
_MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "0"
_PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""