mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 11:28:28 +03:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0e3c427371 | ||
|
|
7497bc08c0 | ||
|
|
49863768fe | ||
|
|
7b9483bbf9 | ||
|
|
cd81cae559 | ||
|
|
23894fad51 | ||
|
|
876988b587 | ||
|
|
f95293b84b | ||
|
|
2fbcbe4e8b | ||
|
|
d1e4574c6c | ||
|
|
d347b26d37 | ||
|
|
1970fa3c82 | ||
|
|
aa5430451e | ||
|
|
f35875a94c | ||
|
|
5505255ea3 | ||
|
|
da17b3f68a | ||
|
|
d6dbdec6ac | ||
|
|
224ebe7ee8 | ||
|
|
54a1bc2172 | ||
|
|
77948207d1 | ||
|
|
60552ae0ea | ||
|
|
f255271ecb |
4
Pipfile
4
Pipfile
@@ -14,7 +14,6 @@ loguru = "*"
|
||||
ffmpeg-python = "*"
|
||||
selenium = "*"
|
||||
snscrape = "*"
|
||||
yt-dlp = "*"
|
||||
telethon = "*"
|
||||
google-api-python-client = "*"
|
||||
google-auth-httplib2 = "*"
|
||||
@@ -23,13 +22,14 @@ oauth2client = "*"
|
||||
python-slugify = "*"
|
||||
pyyaml = "*"
|
||||
dateparser = "*"
|
||||
vk-url-scraper = "*"
|
||||
python-twitter-v2 = "*"
|
||||
instaloader = "*"
|
||||
tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
248
Pipfile.lock
generated
248
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4"
|
||||
"sha256": "7176a6666639452dbf30939fa095ff23518aee6da7d9561de0f12ba0aceed527"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -57,19 +57,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:3a1ffeecfe6e61d414617294b822b008e604ccfd83434c483f429a2922db314d",
|
||||
"sha256:ebea98f3054b467caf6c8aead9f0ef78395a78bce78b04db12fde452c02b3734"
|
||||
"sha256:17f0d782487275cac12676a61b3f1a4900954cc454c842b8551ca47a3dcd59b4",
|
||||
"sha256:bf808f7433629650128ab577a9d4a0f4daf072d9f2f3a907b9d567a6952d9154"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.26.66"
|
||||
"version": "==1.26.77"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:4d1ac019e677cc39e615f9d473fa658ea22a8d906c1c562f9406b5d0cd854cbd",
|
||||
"sha256:772da07d2a49a9d2dc8d23e060e88eb72881e58074be7c813aa946ecdbd0e5b5"
|
||||
"sha256:9d94a02f2584b52c65fb3cb309fb1b29d6d0c36d69062722b0275c1c382c44c9",
|
||||
"sha256:d8aa7bffe2422de282b2d02945b7b45d5fecf00f67b65eebb0b1fa3de1abc6d0"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.29.66"
|
||||
"version": "==1.29.77"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -176,11 +176,11 @@
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
|
||||
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
|
||||
"sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
|
||||
"sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2022.6.15"
|
||||
"version": "==2022.12.7"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
@@ -253,11 +253,97 @@
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||
"sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
|
||||
"sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
|
||||
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
|
||||
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
|
||||
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
|
||||
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
|
||||
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
|
||||
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
|
||||
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
|
||||
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
|
||||
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
|
||||
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
|
||||
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
|
||||
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
|
||||
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
|
||||
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
|
||||
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
|
||||
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
|
||||
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
|
||||
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
|
||||
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
|
||||
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
|
||||
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
|
||||
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
|
||||
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
|
||||
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
|
||||
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
|
||||
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
|
||||
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
|
||||
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
|
||||
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
|
||||
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
|
||||
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
|
||||
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
|
||||
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
|
||||
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
|
||||
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
|
||||
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
|
||||
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
|
||||
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
|
||||
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
|
||||
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
|
||||
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
|
||||
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
|
||||
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
|
||||
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
|
||||
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
|
||||
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
|
||||
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
|
||||
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
|
||||
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
|
||||
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
|
||||
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
|
||||
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
|
||||
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
|
||||
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
|
||||
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
|
||||
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
|
||||
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
|
||||
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
|
||||
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
|
||||
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
|
||||
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
|
||||
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
|
||||
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
|
||||
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
|
||||
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
|
||||
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
|
||||
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
|
||||
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
|
||||
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
|
||||
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
|
||||
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
|
||||
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
|
||||
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
|
||||
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
|
||||
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
|
||||
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
|
||||
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
|
||||
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
|
||||
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
|
||||
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
|
||||
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
|
||||
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
|
||||
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
|
||||
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
|
||||
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
|
||||
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==2.0.12"
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.1"
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
@@ -348,11 +434,11 @@
|
||||
},
|
||||
"flask": {
|
||||
"hashes": [
|
||||
"sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b",
|
||||
"sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526"
|
||||
"sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d",
|
||||
"sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.2.2"
|
||||
"version": "==2.2.3"
|
||||
},
|
||||
"future": {
|
||||
"hashes": [
|
||||
@@ -371,19 +457,19 @@
|
||||
},
|
||||
"google-api-python-client": {
|
||||
"hashes": [
|
||||
"sha256:42a44e9adfca6bb27540ce52348aa1d3b81e214bcc53d454a76ebfbe8eee1483",
|
||||
"sha256:f18e9dbb365f0485194a8daf5d60da2cff6a80ce2c9a694efc2b279922cb3dd0"
|
||||
"sha256:577c0aeae1eb3c754eacb9122d369d67609fef759bc6a4fa16cafeab4f30019b",
|
||||
"sha256:b9b6dc5f139892310093ba75d0df4c78f48655078953c923957dab1ec86129e7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.77.0"
|
||||
"version": "==2.79.0"
|
||||
},
|
||||
"google-auth": {
|
||||
"hashes": [
|
||||
"sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc",
|
||||
"sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd"
|
||||
"sha256:5fd170986bce6bfd7bb5c845c4b8362edb1e0cba901e062196e83f8bb5d5d32c",
|
||||
"sha256:75d76ea857df65938e1f71dcbcd7d0cd48e3f80b34b8870ba229c9292081f7ef"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==2.16.0"
|
||||
"version": "==2.16.1"
|
||||
},
|
||||
"google-auth-httplib2": {
|
||||
"hashes": [
|
||||
@@ -435,18 +521,18 @@
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
|
||||
"sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==3.3"
|
||||
"version": "==3.4"
|
||||
},
|
||||
"instaloader": {
|
||||
"hashes": [
|
||||
"sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e"
|
||||
"sha256:16040c170fb5230c1981a47e1990261e3c0ecffe0417be95fa265632244e7c01"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.9.5"
|
||||
"version": "==4.9.6"
|
||||
},
|
||||
"itsdangerous": {
|
||||
"hashes": [
|
||||
@@ -565,11 +651,11 @@
|
||||
},
|
||||
"markdown-it-py": {
|
||||
"hashes": [
|
||||
"sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
|
||||
"sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
|
||||
"sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
|
||||
"sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.0"
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
@@ -700,23 +786,22 @@
|
||||
},
|
||||
"protobuf": {
|
||||
"hashes": [
|
||||
"sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30",
|
||||
"sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b",
|
||||
"sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc",
|
||||
"sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791",
|
||||
"sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717",
|
||||
"sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec",
|
||||
"sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7",
|
||||
"sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab",
|
||||
"sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2",
|
||||
"sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5",
|
||||
"sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1",
|
||||
"sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462",
|
||||
"sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97",
|
||||
"sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
|
||||
"sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb",
|
||||
"sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b",
|
||||
"sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e",
|
||||
"sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930",
|
||||
"sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71",
|
||||
"sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4",
|
||||
"sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491",
|
||||
"sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6",
|
||||
"sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1",
|
||||
"sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e",
|
||||
"sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571",
|
||||
"sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e",
|
||||
"sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.21.12"
|
||||
"version": "==4.22.0"
|
||||
},
|
||||
"pyaes": {
|
||||
"hashes": [
|
||||
@@ -838,14 +923,6 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"python-dotenv": {
|
||||
"hashes": [
|
||||
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
|
||||
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==0.20.0"
|
||||
},
|
||||
"python-slugify": {
|
||||
"hashes": [
|
||||
"sha256:51f217508df20a6c166c7821683384b998560adcf8f19a6c2ca8b460528ccd9c",
|
||||
@@ -1019,11 +1096,11 @@
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
|
||||
"sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa",
|
||||
"sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"
|
||||
],
|
||||
"markers": "python_version >= '3.7' and python_version < '4'",
|
||||
"version": "==2.28.0"
|
||||
"version": "==2.28.2"
|
||||
},
|
||||
"requests-oauthlib": {
|
||||
"hashes": [
|
||||
@@ -1067,11 +1144,11 @@
|
||||
},
|
||||
"selenium": {
|
||||
"hashes": [
|
||||
"sha256:20f28ee4ea9b273b4112a7df5276ebb3052f79ff6eff42a564db6143e5926683",
|
||||
"sha256:fee36724d6cf0b18c73781bb8ec7be4a35ab1e2564e64e64e64da75e50e052af"
|
||||
"sha256:bd04eb41395605d9b2b65fe587f3fed21431da75512985c52772529e5e210c60",
|
||||
"sha256:c48372905bffcc3b24bd55ab4683a07ee5e1f30fe918c59558ea5ee44cedf6c3"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.8.0"
|
||||
"version": "==4.8.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
@@ -1106,11 +1183,11 @@
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
|
||||
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
|
||||
"sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955",
|
||||
"sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.3.2.post1"
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.4"
|
||||
},
|
||||
"telethon": {
|
||||
"hashes": [
|
||||
@@ -1160,11 +1237,11 @@
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
|
||||
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
|
||||
"sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
|
||||
"sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.4.0"
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"typing-inspect": {
|
||||
"hashes": [
|
||||
@@ -1198,27 +1275,30 @@
|
||||
"version": "==4.1.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
|
||||
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.9"
|
||||
"hashes": [
|
||||
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
|
||||
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==1.26.14"
|
||||
},
|
||||
"vk-api": {
|
||||
"hashes": [
|
||||
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
|
||||
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
|
||||
"sha256:c71021506449afe5b9bbb1c4acb0d86b35a007ddc21678478e46fbbeabd1f3ef",
|
||||
"sha256:c7741e40bc05980c91ed94c84542e1e7e7370e101b5eaa74222958d4130fe3c2"
|
||||
],
|
||||
"version": "==11.9.8"
|
||||
"version": "==11.9.9"
|
||||
},
|
||||
"vk-url-scraper": {
|
||||
"hashes": [
|
||||
"sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b",
|
||||
"sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed"
|
||||
"sha256:5a32fb5419f7bb8bd35de8548948fe27a06f857a4d086c87e142bf07aabc3fd7",
|
||||
"sha256:a87c5aa7c1570c3aa87031e78c2052105e3681f57503fd4cb56470c3ab6106d6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.3.10"
|
||||
"version": "==0.3.15"
|
||||
},
|
||||
"websockets": {
|
||||
"hashes": [
|
||||
@@ -1297,11 +1377,11 @@
|
||||
},
|
||||
"werkzeug": {
|
||||
"hashes": [
|
||||
"sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f",
|
||||
"sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"
|
||||
"sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe",
|
||||
"sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.2.2"
|
||||
"version": "==2.2.3"
|
||||
},
|
||||
"wsproto": {
|
||||
"hashes": [
|
||||
@@ -1313,11 +1393,11 @@
|
||||
},
|
||||
"yt-dlp": {
|
||||
"hashes": [
|
||||
"sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02",
|
||||
"sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae"
|
||||
"sha256:3b2df037c80922f0f83f63ee2f9253496b4a8668c0fe8d2a836ba9040f853b07",
|
||||
"sha256:9af92de5effc193bdb51216d9ebf28874d96180d202fae752b0d9f2a63380f3a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2022.7.18"
|
||||
"version": "==2023.2.17"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
|
||||
52
README.md
52
README.md
@@ -1,4 +1,12 @@
|
||||
# Auto Archiver
|
||||
<h1 align="center">Auto Archiver</h1>
|
||||
|
||||
[](https://badge.fury.io/py/auto-archiver)
|
||||
[](https://pypi.org/project/auto-archiver/)
|
||||
<!--  -->
|
||||
<!-- [](https://pypi.python.org/pypi/auto-archiver/) -->
|
||||
<!-- [](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
|
||||
|
||||
|
||||
Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).
|
||||
|
||||
|
||||
@@ -15,6 +23,11 @@ But **you always need a configuration/orchestration file**, which is where you'l
|
||||
## How to run the auto-archiver
|
||||
|
||||
### Option 1 - docker
|
||||
|
||||
<details><summary><code>Docker instructions</code></summary>
|
||||
|
||||
[](https://hub.docker.com/r/bellingcat/auto-archiver)
|
||||
|
||||
Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
|
||||
|
||||
|
||||
@@ -32,14 +45,20 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
||||
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
|
||||
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
|
||||
|
||||
</details>
|
||||
|
||||
### Option 2 - python package
|
||||
|
||||
<details><summary><code>Python package instructions</code></summary>
|
||||
|
||||
1. make sure you have python 3.8 or higher installed
|
||||
2. install the package `pip/pipenv/conda install auto-archiver`
|
||||
3. test it's installed with `auto-archiver --help`
|
||||
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml`
|
||||
1. if your orchestration file is inside a `secrets/` which we advise
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### Option 3 - local installation
|
||||
This can also be used for development.
|
||||
@@ -60,13 +79,6 @@ Clone and run:
|
||||
|
||||
</details><br/>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
### Examples
|
||||
|
||||
|
||||
# Orchestration
|
||||
The archiver work is orchestrated by the following workflow (we call each a **step**):
|
||||
1. **Feeder** gets the links (from a spreadsheet, from the console, ...)
|
||||
@@ -85,7 +97,7 @@ The structure of orchestration file is split into 2 parts: `steps` (what **steps
|
||||
steps:
|
||||
feeder: gsheet_feeder
|
||||
archivers: # order matters
|
||||
- youtubedl_enricher
|
||||
- youtubedl_archiver
|
||||
enrichers:
|
||||
- thumbnail_enricher
|
||||
formatter: html_formatter
|
||||
@@ -141,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
|
||||
# all the configurations come from ./orchestration.yaml
|
||||
auto-archiver
|
||||
# all the configurations come from ./secrets/orchestration.yaml
|
||||
auto-archiver --config orchestration.yaml
|
||||
# uses the configurations but for another google docs sheet
|
||||
auto-archiver --config secrets/orchestration.yaml
|
||||
# uses the same configurations but for another google docs sheet
|
||||
# with a header on row 2 and with some different column names
|
||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}'
|
||||
auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
||||
auto-archiver --s3_storage.private=1
|
||||
```
|
||||
@@ -154,11 +166,11 @@ auto-archiver --s3_storage.private=1
|
||||
#### Google Drive
|
||||
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
|
||||
|
||||
#### Telethon (Telegrams API Library)
|
||||
#### Telethon + Instagram with telegram bot
|
||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
||||
|
||||
|
||||
## Running on Google Sheets Feeder (gsheets_feeder)
|
||||
## Running on Google Sheets Feeder (gsheet_feeder)
|
||||
The `--gseets_feeder.sheet` property is the name of the Google Sheet to check for URLs.
|
||||
This sheet must have been shared with the Google Service account used by `gspread`.
|
||||
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
|
||||
@@ -171,23 +183,25 @@ When the auto archiver starts running, it updates the "Archive status" column.
|
||||

|
||||
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
||||

|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
|
||||
|
||||
---
|
||||
## Development
|
||||
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.
|
||||
|
||||
# Docker development
|
||||
* working with docker locally:
|
||||
#### Docker development
|
||||
working with docker locally:
|
||||
* `docker build . -t auto-archiver` to build a local image
|
||||
* `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
|
||||
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
||||
* release to docker hub
|
||||
|
||||
|
||||
release to docker hub
|
||||
* `docker image tag auto-archiver bellingcat/auto-archiver:latest`
|
||||
* `docker push bellingcat/auto-archiver`
|
||||
|
||||
# RELEASE
|
||||
#### RELEASE
|
||||
* update version in [version.py](src/auto_archiver/version.py)
|
||||
* run `bash ./scripts/release.sh` and confirm
|
||||
* package is automatically updated in pypi
|
||||
|
||||
@@ -1,80 +1,123 @@
|
||||
steps:
|
||||
# only 1 feeder allowed
|
||||
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary
|
||||
feeder: gsheet_feeder # default -> only expects URL from CLI
|
||||
archivers: # order matters
|
||||
- telethon
|
||||
# - tiktok
|
||||
# - twitter
|
||||
# - instagram
|
||||
# - webarchive # this way it runs as a failsafe only
|
||||
# enrichers:
|
||||
# - screenshot
|
||||
# - wacz
|
||||
# - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
|
||||
# - thumbnails
|
||||
formatters:
|
||||
- HTMLFormater
|
||||
- PdfFormater
|
||||
feeder: gsheet_feeder # defaults to cli_feeder
|
||||
archivers: # order matters, uncomment to activate
|
||||
# - vk_archiver
|
||||
# - telethon_archiver
|
||||
# - telegram_archiver
|
||||
# - twitter_archiver
|
||||
# - twitter_api_archiver
|
||||
# - instagram_tbot_archiver
|
||||
# - instagram_archiver
|
||||
# - tiktok_archiver
|
||||
- youtubedl_archiver
|
||||
- wayback_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - screenshot_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_enricher
|
||||
|
||||
formatter: html_formatter # defaults to mute_formatter
|
||||
storages:
|
||||
- local_storage
|
||||
- s3
|
||||
# - s3_storage
|
||||
# - gdrive_storage
|
||||
databases:
|
||||
- gsheets_db
|
||||
- mongo_db
|
||||
|
||||
|
||||
- console_db
|
||||
# - csv_db
|
||||
# - gsheet_db
|
||||
# - mongo_db
|
||||
|
||||
configurations:
|
||||
gsheet_feeder:
|
||||
sheet: my-auto-archiver
|
||||
header: 2 # defaults to 1 in GSheetsFeeder
|
||||
sheet: "your sheet name"
|
||||
header: 1
|
||||
service_account: "secrets/service_account.json"
|
||||
# allow_worksheets: "allowed"
|
||||
# block_worksheets: "blocked1,blocked2"
|
||||
# allow_worksheets: "only parse this worksheet"
|
||||
# block_worksheets: "blocked sheet 1,blocked sheet 2"
|
||||
use_sheet_names_in_stored_paths: false
|
||||
columns:
|
||||
'url': 'link'
|
||||
'status': 'archive status'
|
||||
'folder': 'destination folder'
|
||||
'archive': 'archive location'
|
||||
'date': 'archive date'
|
||||
'thumbnail': 'thumbnail'
|
||||
'thumbnail_index': 'thumbnail index'
|
||||
'timestamp': 'upload timestamp'
|
||||
'title': 'upload title'
|
||||
'duration': 'duration'
|
||||
'screenshot': 'screenshot'
|
||||
'hash': 'hash'
|
||||
'wacz': 'wacz'
|
||||
'replaywebpage': 'replaywebpage'
|
||||
telethon:
|
||||
api_id: "1234567"
|
||||
api_hash: "examplehash"
|
||||
session_file: "secrets/anon"
|
||||
channel_invites:
|
||||
- invite: https://t.me/+XXXXXXXXXXXXXX
|
||||
id: 1000000000
|
||||
- invite: https://t.me/joinchat/XXXXXXXXXXXXXX
|
||||
id: 1000000001
|
||||
url: link
|
||||
status: archive status
|
||||
folder: destination folder
|
||||
archive: archive location
|
||||
date: archive date
|
||||
thumbnail: thumbnail
|
||||
thumbnail_index: thumbnail index
|
||||
timestamp: upload timestamp
|
||||
title: upload title
|
||||
text: textual content
|
||||
duration: duration
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
replaywebpage: replaywebpage
|
||||
instagram_tbot_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
telethon_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
join_channels: false
|
||||
channel_invites: # if you want to archive from private channels
|
||||
- invite: https://t.me/+123456789
|
||||
id: 0000000001
|
||||
- invite: https://t.me/+123456788
|
||||
id: 0000000002
|
||||
|
||||
tiktok:
|
||||
api_keys:
|
||||
- username: 1
|
||||
password: 2
|
||||
- username: 3
|
||||
password: 4
|
||||
username: "abc"
|
||||
password: "123"
|
||||
token: "here"
|
||||
screenshot:
|
||||
twitter_api_archiver:
|
||||
# either bearer_token only
|
||||
bearer_token: "TWITTER_BEARER_TOKEN"
|
||||
# OR all of the below
|
||||
# consumer_key: ""
|
||||
# consumer_secret: ""
|
||||
# access_token: ""
|
||||
# access_secret: ""
|
||||
instagram_archiver:
|
||||
username: "INSTAGRAM_USERNAME"
|
||||
password: "INSTAGRAM_PASSWORD"
|
||||
# session_file: "secrets/instaloader.session"
|
||||
|
||||
vk_archiver:
|
||||
username: "or phone number"
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 4600
|
||||
wacz:
|
||||
height: 2300
|
||||
wayback_archiver_enricher:
|
||||
timeout: 10
|
||||
key: "wayback key"
|
||||
secret: "wayback secret"
|
||||
hash_enricher:
|
||||
algorithm: "SHA3-512" # can also be SHA-256
|
||||
wacz_enricher:
|
||||
profile: secrets/profile.tar.gz
|
||||
webarchive:
|
||||
api_key: "12345"
|
||||
s3:
|
||||
- bucket: 123
|
||||
- region: "nyc3"
|
||||
- cdn: "{region}{bucket}"
|
||||
local_storage:
|
||||
save_to: "./local_archive"
|
||||
save_absolute: true
|
||||
filename_generator: static
|
||||
path_generator: flat
|
||||
s3_storage:
|
||||
bucket: your-bucket-name
|
||||
region: reg1
|
||||
key: S3_KEY
|
||||
secret: S3_SECRET
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
private: false
|
||||
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
|
||||
key_path: random
|
||||
|
||||
gdrive_storage:
|
||||
path_generator: url
|
||||
filename_generator: random
|
||||
root_folder_id: folder_id_from_url
|
||||
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
|
||||
service_account: "secrets/service_account.json"
|
||||
|
||||
@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
from .twitter_api_archiver import TwitterApiArchiver
|
||||
from .instagram_archiver import InstagramArchiver
|
||||
from .instagram_tbot_archiver import InstagramTbotArchiver
|
||||
from .tiktok_archiver import TiktokArchiver
|
||||
from .telegram_archiver import TelegramArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
|
||||
77
src/auto_archiver/archivers/instagram_tbot_archiver.py
Normal file
77
src/auto_archiver/archivers/instagram_tbot_archiver.py
Normal file
@@ -0,0 +1,77 @@
|
||||
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
import time, os
|
||||
from sqlite3 import OperationalError
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class InstagramTbotArchiver(Archiver):
|
||||
"""
|
||||
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
|
||||
https://github.com/adw0rd/instagrapi
|
||||
https://t.me/instagram_load_bot
|
||||
"""
|
||||
name = "instagram_tbot_archiver"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("api_id")
|
||||
self.assert_valid_string("api_hash")
|
||||
self.timeout = int(self.timeout)
|
||||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
if not "instagram.com" in url: return False
|
||||
|
||||
result = Metadata()
|
||||
tmp_dir = item.get_tmp_dir()
|
||||
with self.client.start():
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_id = self.client.send_message(entity=chat, message=url).id
|
||||
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(4)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < self.timeout and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
return result.success("insta-via-bot")
|
||||
@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
|
||||
with self.client.start():
|
||||
# with self.client.start(bot_token=self.bot_token):
|
||||
try:
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
except ValueError as e:
|
||||
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
||||
return False
|
||||
|
||||
@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
|
||||
return self.link_clean_pattern.sub("\\1", url)
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
|
||||
# Twitter posts are static
|
||||
# Twitter posts are static (for now)
|
||||
return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
|
||||
result.add_media(media)
|
||||
|
||||
return result.success("twitter")
|
||||
return result.success("twitter-snscrape")
|
||||
|
||||
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
|
||||
"""
|
||||
|
||||
@@ -6,7 +6,7 @@ from ..core import Metadata, Media
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "youtubedl_enricher"
|
||||
name = "youtubedl_archiver"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
@@ -63,6 +63,9 @@ class Metadata:
|
||||
def is_success(self) -> bool:
|
||||
return "success" in self.status
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
|
||||
|
||||
@property # getter .netloc
|
||||
def netloc(self) -> str:
|
||||
return urlparse(self.get_url()).netloc
|
||||
@@ -122,7 +125,7 @@ class Metadata:
|
||||
for m in self.media:
|
||||
if m.get("id") == id: return m
|
||||
return default
|
||||
|
||||
|
||||
def get_first_image(self, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if "image" in m.mimetype: return m
|
||||
|
||||
@@ -123,6 +123,9 @@ class ArchivingOrchestrator:
|
||||
s.store(final_media, result)
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
result.status = "nothing archived"
|
||||
|
||||
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
||||
for d in self.databases: d.done(result)
|
||||
|
||||
|
||||
@@ -2,10 +2,8 @@ from typing import Union, Tuple
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
|
||||
# from . import Enricher
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
@@ -61,13 +59,13 @@ class GsheetsDb(Database):
|
||||
cell_updates.append((row, 'status', item.status))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", "")[:500])
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
if (screenshot := item.get_media_by_id("screenshot")):
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||
|
||||
if (thumbnail := item.get_first_image("thumbnail")):
|
||||
|
||||
@@ -3,7 +3,7 @@ import time, uuid, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..utils import Webdriver, UrlUtil
|
||||
from ..core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
@@ -14,16 +14,21 @@ class ScreenshotEnricher(Enricher):
|
||||
return {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(2)
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
@@ -31,4 +36,3 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
# return None
|
||||
|
||||
@@ -3,6 +3,7 @@ from loguru import logger
|
||||
|
||||
from ..core import Media, Metadata
|
||||
from . import Enricher
|
||||
from ..utils import UrlUtil
|
||||
|
||||
|
||||
class WaczEnricher(Enricher):
|
||||
@@ -20,11 +21,17 @@ class WaczEnricher(Enricher):
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil
|
||||
from ..core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"calling wayback for {url=}")
|
||||
|
||||
if to_enrich.get("wayback"):
|
||||
|
||||
@@ -3,6 +3,7 @@ from dataclasses import dataclass
|
||||
import mimetypes, uuid, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
from loguru import logger
|
||||
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media
|
||||
@@ -26,12 +27,17 @@ class HtmlFormatter(Formatter):
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
|
||||
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
}
|
||||
|
||||
def format(self, item: Metadata) -> Media:
|
||||
url = item.get_url()
|
||||
if item.is_empty():
|
||||
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
|
||||
return
|
||||
|
||||
content = self.template.render(
|
||||
url=item.get_url(),
|
||||
url=url,
|
||||
title=item.get_title(),
|
||||
media=item.media,
|
||||
metadata=item.get_clean_metadata(),
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
from .webdriver import Webdriver
|
||||
from .gsheet import Gsheets
|
||||
from .gsheet import Gsheets
|
||||
from .url import UrlUtil
|
||||
@@ -40,11 +40,11 @@ class GWorksheet:
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.headers.index(self.columns[col])
|
||||
return self.headers.index(self.columns[col].lower())
|
||||
|
||||
def col_exists(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.columns[col] in self.headers
|
||||
return self.columns[col].lower() in self.headers
|
||||
|
||||
def count_rows(self):
|
||||
return len(self.values)
|
||||
|
||||
19
src/auto_archiver/utils/url.py
Normal file
19
src/auto_archiver/utils/url.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import re
|
||||
|
||||
class UrlUtil:
|
||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||
|
||||
@staticmethod
|
||||
def clean(url): return url
|
||||
|
||||
@staticmethod
|
||||
def is_auth_wall(url):
|
||||
"""
|
||||
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
||||
"""
|
||||
if UrlUtil.telegram_private.match(url): return True
|
||||
if UrlUtil.is_istagram.match(url): return True
|
||||
|
||||
return False
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "3"
|
||||
_MINOR = "4"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "0"
|
||||
_PATCH = "3"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user